bitkeeper revision 1.1108.44.1 (410a9819I8nRg4cqB44agt4G5KysGA)
author cl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>
Fri, 30 Jul 2004 18:48:57 +0000 (18:48 +0000)
committer cl349@freefall.cl.cam.ac.uk <cl349@freefall.cl.cam.ac.uk>
Fri, 30 Jul 2004 18:48:57 +0000 (18:48 +0000)
add network backend driver for 2.6

17 files changed:
.rootkeys
linux-2.4.26-xen-sparse/mkbuildtree
linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
linux-2.6.7-xen-sparse/drivers/xen/Makefile
linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig [deleted file]
linux-2.6.7-xen-sparse/drivers/xen/net/Makefile [deleted file]
linux-2.6.7-xen-sparse/drivers/xen/net/network.c [deleted file]
linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/netback/common.h [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/netback/control.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile [new file with mode: 0644]
linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c [new file with mode: 0644]
linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/hypervisor.h
linux-2.6.7-xen-sparse/mm/page_alloc.c [new file with mode: 0644]

diff --git a/.rootkeys b/.rootkeys
index 12babc83d82fe126aea33d2b71007ff87609ea21..b35246e7c7135603fab2671035b64f9c4be812cb 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.7-xen-sparse/drivers/xen/console/console.c
 40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.7-xen-sparse/drivers/xen/evtchn/Makefile
 40f56239DoibTX6R-ZYd3QTXAB8_TA linux-2.6.7-xen-sparse/drivers/xen/evtchn/evtchn.c
-40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
-40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
-405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/net/network.c
+410a9817HEVJvred5Oy_uKH3HFJC5Q linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
+4097ba831lpGeLlPg-bfV8XarVVuoQ linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
+4097ba83wvv8yi5P5xugCUBAdb6O-A linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
+4097ba83byY5bTSugJGZ1exTxIcMKw linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
+4087cf0dGmSbFhFZyIZBJzvqxY-qBw linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
+40f56239lrg_Ob0BJ8WBFS1zeg2CYw linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
+40f56239Wd4k_ycG_mFsSO1r5xKdtQ linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
+405853f6nbeazrNyEWNHBuoSg2PiPA linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
 4108f5c1ppFXVpQzCOAZ6xXYubsjKA linux-2.6.7-xen-sparse/drivers/xen/privcmd/Makefile
 3e5a4e65IUfzzMu2kZFlGEB8-rpTaA linux-2.6.7-xen-sparse/drivers/xen/privcmd/privcmd.c
 40f56239YAjS52QG2FIAQpHDZAdGHg linux-2.6.7-xen-sparse/include/asm-xen/asm-i386/desc.h
 40f5623cBiQhPHILVLrl3xa6bDBaRg linux-2.6.7-xen-sparse/include/asm-xen/xen.h
 3f689063BoW-HWV3auUJ-OqXfcGArw linux-2.6.7-xen-sparse/include/asm-xen/xen_proc.h
 40f56a0ddHCSs3501MY4hRf22tctOw linux-2.6.7-xen-sparse/mkbuildtree
+410a94a4KT6I6X0LVc7djB39tRDp4g linux-2.6.7-xen-sparse/mm/page_alloc.c
 40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Make.defs
 3f776bd1Hy9rn69ntXBhPReUFw9IEA tools/Makefile
 401d7e160vaxMBAUSLSicuZ7AQjJ3w tools/examples/Makefile
diff --git a/linux-2.4.26-xen-sparse/mkbuildtree b/linux-2.4.26-xen-sparse/mkbuildtree
index a2f3a2cd80e540d38dc4a484762310bbf517b425..46b992a988203910af988a2251ff2cb3aa47f9ee 100755 (executable)
--- a/linux-2.4.26-xen-sparse/mkbuildtree
+++ b/linux-2.4.26-xen-sparse/mkbuildtree
@@ -238,4 +238,4 @@ cd ${AD}/arch/xen/drivers/dom0
 ln -sf ../../../../${LINUX_26}/drivers/xen/privcmd/privcmd.c core.c
 
 cd ${AD}/arch/xen/drivers/netif/frontend
-ln -sf ../../../../../${LINUX_26}/drivers/xen/net/network.c main.c
+ln -sf ../../../../../${LINUX_26}/drivers/xen/netfront/netfront.c main.c
diff --git a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
index 35e8c3e3124d0c6f228dade388c0d20a93aa2744..173a6a08917f9735b12a487542887af6f241aa12 100644 (file)
--- a/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
+++ b/linux-2.6.7-xen-sparse/arch/xen/i386/mm/hypervisor.c
@@ -8,6 +8,8 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
 #include <asm/hypervisor.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -126,8 +128,11 @@ static inline void __flush_page_update_queue(void)
 #endif
     idx = 0;
     wmb(); /* Make sure index is cleared first to avoid double updates. */
-    if (unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0))
-       panic("Failed to execute MMU updates");
+    if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx, NULL) < 0) )
+    {
+        printk(KERN_ALERT "Failed to execute MMU updates.\n");
+        BUG();
+    }
 }
 
 void _flush_page_update_queue(void)
@@ -262,3 +267,91 @@ void queue_machphys_update(unsigned long mfn, unsigned long pfn)
     increment_index();
     spin_unlock_irqrestore(&update_lock, flags);
 }
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+
+unsigned long allocate_empty_lowmem_region(unsigned long pages)
+{
+    pgd_t         *pgd; 
+    pmd_t         *pmd;
+    pte_t         *pte;
+    unsigned long *pfn_array;
+    unsigned long  vstart;
+    unsigned long  i;
+    int            ret;
+    unsigned int   order = get_order(pages*PAGE_SIZE);
+
+    vstart = __get_free_pages(GFP_KERNEL, order);
+    if ( vstart == 0 )
+        return 0UL;
+
+    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+    if ( pfn_array == NULL )
+        BUG();
+
+    for ( i = 0; i < (1<<order); i++ )
+    {
+        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
+        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
+        pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
+        pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
+        queue_l1_entry_update(pte, 0);
+        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = 0xdeadbeef;
+    }
+
+    flush_page_update_queue();
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, 
+                                pfn_array, 1<<order);
+    if ( unlikely(ret != (1<<order)) )
+    {
+        printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
+        BUG();
+    }
+
+    vfree(pfn_array);
+
+    return vstart;
+}
+
+void deallocate_lowmem_region(unsigned long vstart, unsigned long pages)
+{
+    pgd_t         *pgd; 
+    pmd_t         *pmd;
+    pte_t         *pte;
+    unsigned long *pfn_array;
+    unsigned long  i;
+    int            ret;
+    unsigned int   order = get_order(pages*PAGE_SIZE);
+
+    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+    if ( pfn_array == NULL )
+        BUG();
+
+    ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+                                pfn_array, 1<<order);
+    if ( unlikely(ret != (1<<order)) )
+    {
+        printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
+               ret);
+        BUG();
+    }
+
+    for ( i = 0; i < (1<<order); i++ )
+    {
+        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
+        pmd = pmd_offset(pgd, (vstart + (i*PAGE_SIZE)));
+        pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+        queue_l1_entry_update(pte, (pfn_array[i]<<PAGE_SHIFT)|__PAGE_KERNEL);
+        queue_machphys_update(pfn_array[i], __pa(vstart)>>PAGE_SHIFT);
+        phys_to_machine_mapping[__pa(vstart)>>PAGE_SHIFT] = pfn_array[i];
+    }
+
+    flush_page_update_queue();
+
+    vfree(pfn_array);
+
+    free_pages(vstart, order);
+}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
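
The two helpers added above trade memory with Xen in power-of-two chunks: get_order() rounds the requested page count up, so every loop bound and both dom_mem_op calls operate on 1<<order pages, which can be more than the 'pages' argument asked for. A minimal userspace sketch of that rounding, reimplementing get_order() for illustration only:

    #include <stdio.h>

    /* Illustration-only mirror of the kernel's get_order(): the smallest
     * 'order' such that (1 << order) 4 KiB pages cover 'size' bytes. */
    static unsigned int get_order(unsigned long size)
    {
        unsigned long pages = (size + 4095) >> 12;
        unsigned int  order = 0;
        while ( (1UL << order) < pages )
            order++;
        return order;
    }

    int main(void)
    {
        /* Asking for 3 pages really reserves 4; asking for 5 reserves 8. */
        printf("3 pages -> order %u\n", get_order(3 * 4096)); /* 2 */
        printf("5 pages -> order %u\n", get_order(5 * 4096)); /* 3 */
        return 0;
    }
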
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/Makefile
index 2b3219fd4ed32455f81a9ae2daf22c14525e1073..ae06ee79a7f72d791d4b782f7cf4260c5b29bcdc 100644 (file)
--- a/linux-2.6.7-xen-sparse/drivers/xen/Makefile
+++ b/linux-2.6.7-xen-sparse/drivers/xen/Makefile
@@ -3,5 +3,7 @@
 obj-y  += block/
 obj-y  += console/
 obj-y  += evtchn/
-obj-y  += net/
+obj-y  += netfront/
 obj-y  += privcmd/
+
+obj-$(CONFIG_XEN_PHYSDEV_ACCESS)       += netback/
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig b/linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
deleted file mode 100644 (file)
index 334e6c3..0000000
--- a/linux-2.6.7-xen-sparse/drivers/xen/net/Kconfig
+++ /dev/null
@@ -1,6 +0,0 @@
-
-config XENNET
-       tristate "Xen network driver"
-       depends on NETDEVICES && ARCH_XEN
-       help
-         Network driver for Xen
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
deleted file mode 100644 (file)
index 694dd83..0000000
--- a/linux-2.6.7-xen-sparse/drivers/xen/net/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-
-obj-y  := network.o
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/net/network.c b/linux-2.6.7-xen-sparse/drivers/xen/net/network.c
deleted file mode 100644 (file)
index 9381595..0000000
--- a/linux-2.6.7-xen-sparse/drivers/xen/net/network.c
+++ /dev/null
@@ -1,882 +0,0 @@
-/******************************************************************************
- * Virtual network driver for conversing with remote driver backends.
- * 
- * Copyright (c) 2002-2004, K A Fraser
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/version.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/skbuff.h>
-#include <linux/init.h>
-
-#include <asm/io.h>
-#include <net/sock.h>
-#include <net/pkt_sched.h>
-
-#include <asm-xen/evtchn.h>
-#include <asm-xen/ctrl_if.h>
-
-#include <asm/page.h>
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-#include <asm-xen/netif.h>
-#else
-#include "../netif.h"
-#define irqreturn_t void
-#define IRQ_HANDLED
-#endif
-
-#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
-
-static void network_tx_buf_gc(struct net_device *dev);
-static void network_alloc_rx_buffers(struct net_device *dev);
-
-static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
-static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
-static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
-
-static struct list_head dev_list;
-
-struct net_private
-{
-    struct list_head list;
-    struct net_device *dev;
-
-    struct net_device_stats stats;
-    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
-    unsigned int tx_full;
-    
-    netif_tx_interface_t *tx;
-    netif_rx_interface_t *rx;
-
-    spinlock_t   tx_lock;
-    spinlock_t   rx_lock;
-
-    unsigned int handle;
-    unsigned int evtchn;
-    unsigned int irq;
-
-    /* What is the status of our connection to the remote backend? */
-#define BEST_CLOSED       0
-#define BEST_DISCONNECTED 1
-#define BEST_CONNECTED    2
-    unsigned int backend_state;
-
-    /* Is this interface open or closed (down or up)? */
-#define UST_CLOSED        0
-#define UST_OPEN          1
-    unsigned int user_state;
-
-    /*
-     * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
-     * array is an index into a chain of free entries.
-     */
-    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
-    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
-};
-
-/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
-#define ADD_ID_TO_FREELIST(_list, _id)             \
-    (_list)[(_id)] = (_list)[0];                   \
-    (_list)[0]     = (void *)(unsigned long)(_id);
-#define GET_ID_FROM_FREELIST(_list)                \
- ({ unsigned long _id = (unsigned long)(_list)[0]; \
-    (_list)[0]  = (_list)[_id];                    \
-    (unsigned short)_id; })
-
-static struct net_device *find_dev_by_handle(unsigned int handle)
-{
-    struct list_head *ent;
-    struct net_private *np;
-    list_for_each ( ent, &dev_list )
-    {
-        np = list_entry(ent, struct net_private, list);
-        if ( np->handle == handle )
-            return np->dev;
-    }
-    return NULL;
-}
-
-/** Network interface info. */
-struct netif_ctrl {
-    /** Number of interfaces. */
-    int interface_n;
-    /** Number of connected interfaces. */
-    int connected_n;
-    /** Error code. */
-    int err;
-};
-
-static struct netif_ctrl netctrl;
-
-static void netctrl_init(void)
-{
-    memset(&netctrl, 0, sizeof(netctrl));
-    netctrl.interface_n = -1;
-}
-
-/** Get or set a network interface error.
- */
-static int netctrl_err(int err)
-{
-    if(err < 0 && !netctrl.err){
-        netctrl.err = err;
-        printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
-    }
-    return netctrl.err;
-}
-
-/** Test if all network interfaces are connected.
- *
- * @return 1 if all connected, 0 if not, negative error code otherwise
- */
-static int netctrl_connected(void)
-{
-    int ok = 0;
-    ok = (netctrl.err ? netctrl.err :
-          (netctrl.connected_n == netctrl.interface_n));
-    return ok;
-}
-
-/** Count the connected network interfaces.
- *
- * @return connected count
- */
-static int netctrl_connected_count(void)
-{
-    
-    struct list_head *ent;
-    struct net_private *np;
-    unsigned int connected;
-
-    connected = 0;
-    
-    list_for_each(ent, &dev_list)
-    {
-        np = list_entry(ent, struct net_private, list);
-        if ( np->backend_state == BEST_CONNECTED )
-            connected++;
-    }
-
-    netctrl.connected_n = connected;
-    return connected;
-}
-
-static int network_open(struct net_device *dev)
-{
-    struct net_private *np = dev->priv;
-
-    memset(&np->stats, 0, sizeof(np->stats));
-
-    np->user_state = UST_OPEN;
-
-    network_alloc_rx_buffers(dev);
-    np->rx->event = np->rx_resp_cons + 1;
-
-    netif_start_queue(dev);
-
-    return 0;
-}
-
-
-static void network_tx_buf_gc(struct net_device *dev)
-{
-    NETIF_RING_IDX i, prod;
-    unsigned short id;
-    struct net_private *np = dev->priv;
-    struct sk_buff *skb;
-
-    if ( np->backend_state != BEST_CONNECTED )
-        return;
-
-    do {
-        prod = np->tx->resp_prod;
-
-        for ( i = np->tx_resp_cons; i != prod; i++ )
-        {
-            id  = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
-            skb = np->tx_skbs[id];
-            ADD_ID_TO_FREELIST(np->tx_skbs, id);
-            dev_kfree_skb_any(skb);
-        }
-        
-        np->tx_resp_cons = prod;
-        
-        /*
-         * Set a new event, then check for race with update of tx_cons. Note
-         * that it is essential to schedule a callback, no matter how few
-         * buffers are pending. Even if there is space in the transmit ring,
-         * higher layers may be blocked because too much data is outstanding:
-         * in such cases notification from Xen is likely to be the only kick
-         * that we'll get.
-         */
-        np->tx->event = 
-            prod + ((np->tx->req_prod - prod) >> 1) + 1;
-        mb();
-    }
-    while ( prod != np->tx->resp_prod );
-
-    if ( np->tx_full && 
-         ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
-    {
-        np->tx_full = 0;
-        if ( np->user_state == UST_OPEN )
-            netif_wake_queue(dev);
-    }
-}
-
-
-static void network_alloc_rx_buffers(struct net_device *dev)
-{
-    unsigned short id;
-    struct net_private *np = dev->priv;
-    struct sk_buff *skb;
-    NETIF_RING_IDX i = np->rx->req_prod;
-    int nr_pfns = 0;
-
-    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
-    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
-         unlikely(np->backend_state != BEST_CONNECTED) )
-        return;
-
-    do {
-        skb = dev_alloc_skb(RX_BUF_SIZE);
-        if ( unlikely(skb == NULL) )
-            break;
-
-        skb->dev = dev;
-
-        if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
-            panic("alloc_skb needs to provide us page-aligned buffers.");
-
-        id = GET_ID_FROM_FREELIST(np->rx_skbs);
-
-        np->rx_skbs[id] = skb;
-        
-        np->rx->ring[MASK_NETIF_RX_IDX(i)].req.id = id;
-        
-        rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
-
-        rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
-        rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
-        rx_mcl[nr_pfns].args[1] = 0;
-        rx_mcl[nr_pfns].args[2] = 0;
-
-        nr_pfns++;
-    }
-    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
-
-    if ( unlikely(nr_pfns == 0) )
-        return;
-
-    /*
-     * We may have allocated buffers which have entries outstanding in the page
-     * update queue -- make sure we flush those first!
-     */
-    flush_page_update_queue();
-
-    /* After all PTEs have been zapped we blow away stale TLB entries. */
-    rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
-
-    /* Give away a batch of pages. */
-    rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
-    rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
-    rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
-    rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
-
-    /* Zap PTEs and give away pages in one big multicall. */
-    (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
-
-    /* Check return status of HYPERVISOR_dom_mem_op(). */
-    if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
-        panic("Unable to reduce memory reservation\n");
-
-    np->rx->req_prod = i;
-}
-
-
-static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-    unsigned short id;
-    struct net_private *np = (struct net_private *)dev->priv;
-    netif_tx_request_t *tx;
-    NETIF_RING_IDX i;
-
-    if ( unlikely(np->tx_full) )
-    {
-        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
-        netif_stop_queue(dev);
-        return -ENOBUFS;
-    }
-
-    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
-                  PAGE_SIZE) )
-    {
-        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
-        if ( unlikely(new_skb == NULL) )
-            return 1;
-        skb_put(new_skb, skb->len);
-        memcpy(new_skb->data, skb->data, skb->len);
-        dev_kfree_skb(skb);
-        skb = new_skb;
-    }
-    
-    spin_lock_irq(&np->tx_lock);
-
-    if ( np->backend_state != BEST_CONNECTED )
-    {
-        spin_unlock_irq(&np->tx_lock);
-        return 1;
-    }
-
-    i = np->tx->req_prod;
-
-    id = GET_ID_FROM_FREELIST(np->tx_skbs);
-    np->tx_skbs[id] = skb;
-
-    tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;
-
-    tx->id   = id;
-    tx->addr = virt_to_machine(skb->data);
-    tx->size = skb->len;
-
-    wmb();
-    np->tx->req_prod = i + 1;
-
-    network_tx_buf_gc(dev);
-
-    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
-    {
-        np->tx_full = 1;
-        netif_stop_queue(dev);
-    }
-
-    spin_unlock_irq(&np->tx_lock);
-
-    np->stats.tx_bytes += skb->len;
-    np->stats.tx_packets++;
-
-    /* Only notify Xen if there are no outstanding responses. */
-    mb();
-    if ( np->tx->resp_prod == i )
-        notify_via_evtchn(np->evtchn);
-
-    return 0;
-}
-
-
-static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
-{
-    struct net_device *dev = dev_id;
-    struct net_private *np = dev->priv;
-    unsigned long flags;
-
-    spin_lock_irqsave(&np->tx_lock, flags);
-    network_tx_buf_gc(dev);
-    spin_unlock_irqrestore(&np->tx_lock, flags);
-
-    if ( (np->rx_resp_cons != np->rx->resp_prod) &&
-         (np->user_state == UST_OPEN) )
-        netif_rx_schedule(dev);
-
-    return IRQ_HANDLED;
-}
-
-
-static int netif_poll(struct net_device *dev, int *pbudget)
-{
-    struct net_private *np = dev->priv;
-    struct sk_buff *skb;
-    netif_rx_response_t *rx;
-    NETIF_RING_IDX i;
-    mmu_update_t *mmu = rx_mmu;
-    multicall_entry_t *mcl = rx_mcl;
-    int work_done, budget, more_to_do = 1;
-    struct sk_buff_head rxq;
-    unsigned long flags;
-
-    spin_lock(&np->rx_lock);
-
-    if ( np->backend_state != BEST_CONNECTED )
-    {
-        spin_unlock(&np->rx_lock);
-        return 0;
-    }
-
-    skb_queue_head_init(&rxq);
-
-    if ( (budget = *pbudget) > dev->quota )
-        budget = dev->quota;
-
-    for ( i = np->rx_resp_cons, work_done = 0; 
-          (i != np->rx->resp_prod) && (work_done < budget); 
-          i++, work_done++ )
-    {
-        rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
-
-        /*
-         * An error here is very odd. Usually indicates a backend bug,
-         * low-memory condition, or that we didn't have reservation headroom.
-         * Whatever - print an error and queue the id again straight away.
-         */
-        if ( unlikely(rx->status <= 0) )
-        {
-            /* Gate this error. We get a (valid) slew of them on suspend. */
-            if ( np->user_state == UST_OPEN )
-                printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
-            np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
-            wmb();
-            np->rx->req_prod++;
-            continue;
-        }
-
-        skb = np->rx_skbs[rx->id];
-        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
-
-        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
-        skb_put(skb, rx->status);
-
-        np->stats.rx_packets++;
-        np->stats.rx_bytes += rx->status;
-
-        /* Remap the page. */
-        mmu->ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
-        mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
-        mmu++;
-        mcl->op = __HYPERVISOR_update_va_mapping;
-        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
-        mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
-        mcl->args[2] = 0;
-        mcl++;
-
-        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
-            rx->addr >> PAGE_SHIFT;
-
-        __skb_queue_tail(&rxq, skb);
-    }
-
-    /* Do all the remapping work, and M->P updates, in one big hypercall. */
-    if ( likely((mcl - rx_mcl) != 0) )
-    {
-        mcl->op = __HYPERVISOR_mmu_update;
-        mcl->args[0] = (unsigned long)rx_mmu;
-        mcl->args[1] = mmu - rx_mmu;
-        mcl->args[2] = 0;
-        mcl++;
-        (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
-    }
-
-    while ( (skb = __skb_dequeue(&rxq)) != NULL )
-    {
-        /* Set the shared-info area, which is hidden behind the real data. */
-        atomic_set(&(skb_shinfo(skb)->dataref), 1);
-        skb_shinfo(skb)->nr_frags = 0;
-        skb_shinfo(skb)->frag_list = NULL;
-
-        /* Ethernet-specific work. Delayed to here as it peeks the header. */
-        skb->protocol = eth_type_trans(skb, dev);
-
-        /* Pass it up. */
-        netif_rx(skb);
-        dev->last_rx = jiffies;
-    }
-
-    np->rx_resp_cons = i;
-
-    network_alloc_rx_buffers(dev);
-
-    *pbudget   -= work_done;
-    dev->quota -= work_done;
-
-    if ( work_done < budget )
-    {
-        local_irq_save(flags);
-
-        np->rx->event = i + 1;
-    
-        /* Deal with hypervisor racing our resetting of rx_event. */
-        mb();
-        if ( np->rx->resp_prod == i )
-        {
-            __netif_rx_complete(dev);
-            more_to_do = 0;
-        }
-
-        local_irq_restore(flags);
-    }
-
-    spin_unlock(&np->rx_lock);
-
-    return more_to_do;
-}
-
-
-static int network_close(struct net_device *dev)
-{
-    struct net_private *np = dev->priv;
-    np->user_state = UST_CLOSED;
-    netif_stop_queue(np->dev);
-    return 0;
-}
-
-
-static struct net_device_stats *network_get_stats(struct net_device *dev)
-{
-    struct net_private *np = (struct net_private *)dev->priv;
-    return &np->stats;
-}
-
-
-static void network_connect(struct net_device *dev,
-                            netif_fe_interface_status_changed_t *status)
-{
-    struct net_private *np;
-    int i, requeue_idx;
-    netif_tx_request_t *tx;
-
-    np = dev->priv;
-    spin_lock_irq(&np->rx_lock);
-    spin_lock(&np->tx_lock);
-
-    /* Recovery procedure: */
-
-    /* Step 1: Reinitialise variables. */
-    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
-    np->rx->event = 1;
-
-    /* Step 2: Rebuild the RX and TX ring contents.
-     * NB. We could just free the queued TX packets now but we hope
-     * that sending them out might do some good.  We have to rebuild
-     * the RX ring because some of our pages are currently flipped out
-     * so we can't just free the RX skbs.
-     * NB2. Freelist index entries are always going to be less than
-     *  __PAGE_OFFSET, whereas pointers to skbs will always be equal or
-     * greater than __PAGE_OFFSET: we use this property to distinguish
-     * them.
-     */
-
-    /* Rebuild the TX buffer freelist and the TX ring itself.
-     * NB. This reorders packets.  We could keep more private state
-     * to avoid this but maybe it doesn't matter so much given the
-     * interface has been down.
-     */
-    for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
-    {
-            if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
-            {
-                struct sk_buff *skb = np->tx_skbs[i];
-                
-                tx = &np->tx->ring[requeue_idx++].req;
-                
-                tx->id   = i;
-                tx->addr = virt_to_machine(skb->data);
-                tx->size = skb->len;
-                
-                np->stats.tx_bytes += skb->len;
-                np->stats.tx_packets++;
-            }
-    }
-    wmb();
-    np->tx->req_prod = requeue_idx;
-
-    /* Rebuild the RX buffer freelist and the RX ring itself. */
-    for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
-        if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
-            np->rx->ring[requeue_idx++].req.id = i;
-    wmb();                
-    np->rx->req_prod = requeue_idx;
-
-    /* Step 3: All public and private state should now be sane.  Get
-     * ready to start sending and receiving packets and give the driver
-     * domain a kick because we've probably just requeued some
-     * packets.
-     */
-    np->backend_state = BEST_CONNECTED;
-    notify_via_evtchn(status->evtchn);  
-    network_tx_buf_gc(dev);
-
-    if ( np->user_state == UST_OPEN )
-        netif_start_queue(dev);
-
-    spin_unlock(&np->tx_lock);
-    spin_unlock_irq(&np->rx_lock);
-}
-
-static void netif_status_change(netif_fe_interface_status_changed_t *status)
-{
-    ctrl_msg_t                   cmsg;
-    netif_fe_interface_connect_t up;
-    struct net_device *dev;
-    struct net_private *np;
-    
-    if ( netctrl.interface_n <= 0 )
-    {
-        printk(KERN_WARNING "Status change: no interfaces\n");
-        return;
-    }
-
-    dev = find_dev_by_handle(status->handle);
-    if(!dev){
-        printk(KERN_WARNING "Status change: invalid netif handle %u\n",
-               status->handle);
-         return;
-    }
-    np  = dev->priv;
-    
-    switch ( status->status )
-    {
-    case NETIF_INTERFACE_STATUS_DESTROYED:
-        printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
-               np->backend_state);
-        break;
-
-    case NETIF_INTERFACE_STATUS_DISCONNECTED:
-        if ( np->backend_state != BEST_CLOSED )
-        {
-            printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
-                   " in state %d\n", np->backend_state);
-           printk(KERN_INFO "Attempting to reconnect network interface\n");
-
-            /* Begin interface recovery.
-            *
-            * NB. Whilst we're recovering, we turn the carrier state off.  We
-            * take measures to ensure that this device isn't used for
-            * anything.  We also stop the queue for this device.  Various
-            * different approaches (e.g. continuing to buffer packets) have
-            * been tested but don't appear to improve the overall impact on
-             * TCP connections.
-            *
-             * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
-             * is initiated by a special "RESET" message - disconnect could
-             * just mean we're not allowed to use this interface any more.
-             */
-
-            /* Stop old i/f to prevent errors whilst we rebuild the state. */
-            spin_lock_irq(&np->tx_lock);
-            spin_lock(&np->rx_lock);
-            netif_stop_queue(dev);
-            np->backend_state = BEST_DISCONNECTED;
-            spin_unlock(&np->rx_lock);
-            spin_unlock_irq(&np->tx_lock);
-
-            /* Free resources. */
-            free_irq(np->irq, dev);
-            unbind_evtchn_from_irq(np->evtchn);
-           free_page((unsigned long)np->tx);
-            free_page((unsigned long)np->rx);
-        }
-
-        /* Move from CLOSED to DISCONNECTED state. */
-        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
-        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
-        memset(np->tx, 0, PAGE_SIZE);
-        memset(np->rx, 0, PAGE_SIZE);
-        np->backend_state = BEST_DISCONNECTED;
-
-        /* Construct an interface-CONNECT message for the domain controller. */
-        cmsg.type      = CMSG_NETIF_FE;
-        cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
-        cmsg.length    = sizeof(netif_fe_interface_connect_t);
-        up.handle      = status->handle;
-        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
-        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
-        memcpy(cmsg.msg, &up, sizeof(up));
-        
-        /* Tell the controller to bring up the interface. */
-        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-        break;
-
-    case NETIF_INTERFACE_STATUS_CONNECTED:
-        if ( np->backend_state == BEST_CLOSED )
-        {
-            printk(KERN_WARNING "Unexpected netif-CONNECTED message"
-                   " in state %d\n", np->backend_state);
-            break;
-        }
-
-        memcpy(dev->dev_addr, status->mac, ETH_ALEN);
-
-        network_connect(dev, status);
-
-        np->evtchn = status->evtchn;
-        np->irq = bind_evtchn_to_irq(np->evtchn);
-        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
-                          dev->name, dev);
-        
-        netctrl_connected_count();
-        break;
-
-    default:
-        printk(KERN_WARNING "Status change to unknown value %d\n", 
-               status->status);
-        break;
-    }
-}
-
-/** Create a network device.
- * @param handle device handle
- * @param val return parameter for created device
- * @return 0 on success, error code otherwise
- */
-static int create_netdev(int handle, struct net_device **val)
-{
-    int i, err = 0;
-    struct net_device *dev = NULL;
-    struct net_private *np = NULL;
-
-    if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
-    {
-        printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
-        err = -ENOMEM;
-        goto exit;
-    }
-
-    np                = dev->priv;
-    np->backend_state = BEST_CLOSED;
-    np->user_state    = UST_CLOSED;
-    np->handle        = handle;
-    
-    spin_lock_init(&np->tx_lock);
-    spin_lock_init(&np->rx_lock);
-
-    /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
-    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
-        np->tx_skbs[i] = (void *)(i+1);
-    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
-        np->rx_skbs[i] = (void *)(i+1);
-
-    dev->open            = network_open;
-    dev->hard_start_xmit = network_start_xmit;
-    dev->stop            = network_close;
-    dev->get_stats       = network_get_stats;
-    dev->poll            = netif_poll;
-    dev->weight          = 64;
-    
-    if ( (err = register_netdev(dev)) != 0 )
-    {
-        printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
-        goto exit;
-    }
-    np->dev = dev;
-    list_add(&np->list, &dev_list);
-
-  exit:
-    if ( (err != 0) && (dev != NULL ) )
-        kfree(dev);
-    else if ( val != NULL )
-        *val = dev;
-    return err;
-}
-
-/*
- * Initialize the network control interface. Set the number of network devices
- * and create them.
- */
-static void netif_driver_status_change(
-    netif_fe_driver_status_changed_t *status)
-{
-    int err = 0;
-    int i;
-    
-    netctrl.interface_n = status->nr_interfaces;
-    netctrl.connected_n = 0;
-
-    for ( i = 0; i < netctrl.interface_n; i++ )
-    {
-        if ( (err = create_netdev(i, NULL)) != 0 )
-        {
-            netctrl_err(err);
-            break;
-        }
-    }
-}
-
-static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
-{
-    int respond = 1;
-
-    switch ( msg->subtype )
-    {
-    case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
-        if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
-            goto error;
-        netif_status_change((netif_fe_interface_status_changed_t *)
-                            &msg->msg[0]);
-        break;
-
-    case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
-        if ( msg->length != sizeof(netif_fe_driver_status_changed_t) )
-            goto error;
-        netif_driver_status_change((netif_fe_driver_status_changed_t *)
-                                   &msg->msg[0]);
-        /* Message is a response */
-        respond = 0;
-        break;
-
-    error:
-    default:
-        msg->length = 0;
-        break;
-    }
-
-    if ( respond )
-        ctrl_if_send_response(msg);
-}
-
-
-static int __init netif_init(void)
-{
-    ctrl_msg_t                       cmsg;
-    netif_fe_driver_status_changed_t st;
-    int err = 0, wait_i, wait_n = 20;
-
-    if ( (start_info.flags & SIF_INITDOMAIN) ||
-         (start_info.flags & SIF_NET_BE_DOMAIN) )
-        return 0;
-
-    printk("Initialising Xen virtual ethernet frontend driver");
-
-    INIT_LIST_HEAD(&dev_list);
-
-    netctrl_init();
-
-    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
-                                    CALLBACK_IN_BLOCKING_CONTEXT);
-
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_NETIF_FE;
-    cmsg.subtype   = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
-    cmsg.length    = sizeof(netif_fe_driver_status_changed_t);
-    st.status      = NETIF_DRIVER_STATUS_UP;
-    st.nr_interfaces = 0;
-    memcpy(cmsg.msg, &st, sizeof(st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-
-    /* Wait for all interfaces to be connected. */
-    for ( wait_i = 0; ; wait_i++)
-    {
-        if ( (err = (wait_i < wait_n) ? netctrl_connected() : -ENETDOWN) != 0 )
-        {
-            err = (err > 0) ? 0 : err;
-            break;
-        }
-        set_current_state(TASK_INTERRUPTIBLE);
-        schedule_timeout(1);
-     }
-
-    if ( err )
-        ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
-
-    return err;
-}
-
-__initcall(netif_init);
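
The frontend deleted above (re-added verbatim as netfront/netfront.c) tracks outstanding skbs in the tx_skbs[]/rx_skbs[] arrays, with entry 0 anchoring a chain of free ring IDs that ADD_ID_TO_FREELIST and GET_ID_FROM_FREELIST push and pop. A standalone sketch of the same trick, assuming a small ring and hypothetical helper names:

    #include <assert.h>
    #include <stdio.h>

    #define RING_SIZE 8

    /* list[0] heads a chain of free indices; a free entry stores the next
     * free index, while an in-use entry would store an skb pointer. */
    static unsigned long list[RING_SIZE + 1];

    static void add_id(unsigned short id)
    {
        list[id] = list[0];
        list[0]  = id;
    }

    static unsigned short get_id(void)
    {
        unsigned short id = (unsigned short)list[0];
        list[0] = list[id];
        return id;
    }

    int main(void)
    {
        /* Initialise as create_netdev() does: entry i chains to i+1. */
        for ( int i = 0; i <= RING_SIZE; i++ )
            list[i] = i + 1;
        unsigned short a = get_id(), b = get_id();
        printf("allocated ids %u and %u\n", a, b); /* 1 and 2 */
        add_id(a);
        assert(get_id() == a); /* a freed id is handed out again first */
        return 0;
    }
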
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
new file mode 100644 (file)
index 0000000..3279442
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/Makefile
@@ -0,0 +1,2 @@
+
+obj-y  := netback.o control.o interface.o
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h b/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
new file mode 100644 (file)
index 0000000..b977dff
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/common.h
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/common.h
+ */
+
+#ifndef __NETIF__BACKEND__COMMON_H__
+#define __NETIF__BACKEND__COMMON_H__
+
+#include <linux/config.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <asm-xen/ctrl_if.h>
+#include <asm/io.h>
+#include <asm/pgalloc.h>
+#include <asm-xen/netif.h>
+
+#if 0
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+typedef struct netif_st {
+    /* Unique identifier for this interface. */
+    domid_t          domid;
+    unsigned int     handle;
+
+    /* Physical parameters of the comms window. */
+    unsigned long    tx_shmem_frame;
+    unsigned long    rx_shmem_frame;
+    unsigned int     evtchn;
+    int              irq;
+
+    /* The shared rings and indexes. */
+    netif_tx_interface_t *tx;
+    netif_rx_interface_t *rx;
+
+    /* Private indexes into shared ring. */
+    NETIF_RING_IDX rx_req_cons;
+    NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */
+    NETIF_RING_IDX tx_req_cons;
+    NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */
+
+    /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+    unsigned long   credit_bytes;
+    unsigned long   credit_usec;
+    unsigned long   remaining_credit;
+    struct timer_list credit_timeout;
+
+    /* Miscellaneous private stuff. */
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    /*
+     * DISCONNECT response is deferred until pending requests are ack'ed.
+     * We therefore need to store the id from the original request.
+     */
+    u8               disconnect_rspid;
+    struct netif_st *hash_next;
+    struct list_head list;  /* scheduling list */
+    atomic_t         refcnt;
+    spinlock_t       rx_lock, tx_lock;
+    struct net_device *dev;
+    struct net_device_stats stats;
+} netif_t;
+
+void netif_create(netif_be_create_t *create);
+void netif_destroy(netif_be_destroy_t *destroy);
+void netif_connect(netif_be_connect_t *connect);
+int  netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id);
+void __netif_disconnect_complete(netif_t *netif);
+netif_t *netif_find_by_handle(domid_t domid, unsigned int handle);
+#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define netif_put(_b)                             \
+    do {                                          \
+        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
+            __netif_disconnect_complete(_b);      \
+    } while (0)
+
+void netif_interface_init(void);
+void netif_ctrlif_init(void);
+
+void netif_deschedule(netif_t *netif);
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
+struct net_device_stats *netif_be_get_stats(struct net_device *dev);
+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+
+#endif /* __NETIF__BACKEND__COMMON_H__ */
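
common.h ties interface lifetime to the refcnt field: netif_put() runs __netif_disconnect_complete() only when the last reference drops, which is what allows the DISCONNECT response to be deferred until every pending request has been acked. A standalone sketch of that pattern using C11 atomics; all names here are hypothetical:

    #include <stdatomic.h>
    #include <stdio.h>

    typedef struct {
        atomic_int refcnt;
    } netif_t;

    static void disconnect_complete(netif_t *netif)
    {
        (void)netif;
        printf("last ref gone: send the deferred DISCONNECT response\n");
    }

    #define netif_get(n) atomic_fetch_add(&(n)->refcnt, 1)
    #define netif_put(n)                                  \
        do {                                              \
            if ( atomic_fetch_sub(&(n)->refcnt, 1) == 1 ) \
                disconnect_complete(n);                   \
        } while (0)

    int main(void)
    {
        netif_t n;
        atomic_init(&n.refcnt, 0);
        netif_get(&n); /* reference taken at connect time */
        netif_get(&n); /* reference held for an in-flight request */
        netif_put(&n); /* request acked */
        netif_put(&n); /* disconnect: completion fires here */
        return 0;
    }
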
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
new file mode 100644 (file)
index 0000000..cf1b075
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/control.c
@@ -0,0 +1,65 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/control.c
+ * 
+ * Routines for interfacing with the control plane.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+
+static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    switch ( msg->subtype )
+    {
+    case CMSG_NETIF_BE_CREATE:
+        if ( msg->length != sizeof(netif_be_create_t) )
+            goto parse_error;
+        netif_create((netif_be_create_t *)&msg->msg[0]);
+        break;        
+    case CMSG_NETIF_BE_DESTROY:
+        if ( msg->length != sizeof(netif_be_destroy_t) )
+            goto parse_error;
+        netif_destroy((netif_be_destroy_t *)&msg->msg[0]);
+        break;        
+    case CMSG_NETIF_BE_CONNECT:
+        if ( msg->length != sizeof(netif_be_connect_t) )
+            goto parse_error;
+        netif_connect((netif_be_connect_t *)&msg->msg[0]);
+        break;        
+    case CMSG_NETIF_BE_DISCONNECT:
+        if ( msg->length != sizeof(netif_be_disconnect_t) )
+            goto parse_error;
+        if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) )
+            return; /* Sending the response is deferred until later. */
+        break;        
+    default:
+        goto parse_error;
+    }
+
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    DPRINTK("Parse error while reading message subtype %d, len %d\n",
+            msg->subtype, msg->length);
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
+
+void netif_ctrlif_init(void)
+{
+    ctrl_msg_t                       cmsg;
+    netif_be_driver_status_changed_t st;
+
+    (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    /* Send a driver-UP notification to the domain controller. */
+    cmsg.type      = CMSG_NETIF_BE;
+    cmsg.subtype   = CMSG_NETIF_BE_DRIVER_STATUS_CHANGED;
+    cmsg.length    = sizeof(netif_be_driver_status_changed_t);
+    st.status      = NETIF_DRIVER_STATUS_UP;
+    memcpy(cmsg.msg, &st, sizeof(st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
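
The dispatcher above never casts a control-message payload until the message length matches the expected structure, and it funnels every mismatch through a single parse_error path that zeroes the length to mark an error response. A standalone sketch of the same guard, with made-up message and structure names:

    #include <stdio.h>
    #include <string.h>

    typedef struct {
        unsigned char subtype;
        unsigned char length;
        unsigned char msg[60];
    } ctrl_msg_t;

    typedef struct { unsigned int domid, handle; } be_create_t;

    enum { MSG_CREATE = 1 };

    static void dispatch(ctrl_msg_t *m)
    {
        be_create_t create;

        switch ( m->subtype )
        {
        case MSG_CREATE:
            if ( m->length != sizeof(be_create_t) )
                goto parse_error; /* never cast a short payload */
            memcpy(&create, m->msg, sizeof(create));
            printf("create: domid=%u handle=%u\n", create.domid, create.handle);
            return;
        default:
            goto parse_error;
        }

     parse_error:
        printf("parse error: subtype %u, len %u\n", m->subtype, m->length);
        m->length = 0; /* zero length marks the reply as an error */
    }

    int main(void)
    {
        ctrl_msg_t  m = { .subtype = MSG_CREATE, .length = sizeof(be_create_t) };
        be_create_t c = { .domid = 3, .handle = 0 };
        memcpy(m.msg, &c, sizeof(c));
        dispatch(&m); /* valid length: dispatched */
        m.length = 1;
        dispatch(&m); /* corrupted length: parse_error path */
        return 0;
    }
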
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
new file mode 100644 (file)
index 0000000..01447b1
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/interface.c
@@ -0,0 +1,288 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/interface.c
+ * 
+ * Network-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <linux/rtnetlink.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define VMALLOC_VMADDR(x) ((unsigned long)(x))
+#endif
+
+#define NETIF_HASHSZ 1024
+#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1))
+
+static netif_t *netif_hash[NETIF_HASHSZ];
+
+netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    netif_t *netif = netif_hash[NETIF_HASH(domid, handle)];
+    while ( (netif != NULL) && 
+            ((netif->domid != domid) || (netif->handle != handle)) )
+        netif = netif->hash_next;
+    return netif;
+}
+
+void __netif_disconnect_complete(netif_t *netif)
+{
+    ctrl_msg_t            cmsg;
+    netif_be_disconnect_t disc;
+
+    /*
+     * These can't be done in __netif_disconnect() because at that point there
+     * may be outstanding requests at the interface whose asynchronous responses
+     * must still be notified to the remote driver.
+     */
+    unbind_evtchn_from_irq(netif->evtchn);
+    vfree(netif->tx); /* Frees netif->rx as well. */
+    rtnl_lock();
+    (void)dev_close(netif->dev);
+    rtnl_unlock();
+
+    /* Construct the deferred response message. */
+    cmsg.type         = CMSG_NETIF_BE;
+    cmsg.subtype      = CMSG_NETIF_BE_DISCONNECT;
+    cmsg.id           = netif->disconnect_rspid;
+    cmsg.length       = sizeof(netif_be_disconnect_t);
+    disc.domid        = netif->domid;
+    disc.netif_handle = netif->handle;
+    disc.status       = NETIF_BE_STATUS_OKAY;
+    memcpy(cmsg.msg, &disc, sizeof(disc));
+
+    /*
+     * Make sure message is constructed /before/ status change, because
+     * after the status change the 'netif' structure could be deallocated at
+     * any time. Also make sure we send the response /after/ status change,
+     * as otherwise a subsequent CONNECT request could spuriously fail if
+     * another CPU doesn't see the status change yet.
+     */
+    mb();
+    if ( netif->status != DISCONNECTING )
+        BUG();
+    netif->status = DISCONNECTED;
+    mb();
+
+    /* Send the successful response. */
+    ctrl_if_send_response(&cmsg);
+}
+
+void netif_create(netif_be_create_t *create)
+{
+    int                err = 0;
+    domid_t            domid  = create->domid;
+    unsigned int       handle = create->netif_handle;
+    struct net_device *dev;
+    netif_t          **pnetif, *netif;
+    char               name[IFNAMSIZ] = {};
+
+    snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+    dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
+    if ( dev == NULL )
+    {
+        DPRINTK("Could not create netif: out of memory\n");
+        create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    netif = dev->priv;
+    memset(netif, 0, sizeof(*netif));
+    netif->domid  = domid;
+    netif->handle = handle;
+    netif->status = DISCONNECTED;
+    spin_lock_init(&netif->rx_lock);
+    spin_lock_init(&netif->tx_lock);
+    atomic_set(&netif->refcnt, 0);
+    netif->dev = dev;
+
+    netif->credit_bytes = netif->remaining_credit = ~0UL;
+    netif->credit_usec  = 0UL;
+    /*init_ac_timer(&new_vif->credit_timeout);*/
+
+    pnetif = &netif_hash[NETIF_HASH(domid, handle)];
+    while ( *pnetif != NULL )
+    {
+        if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) )
+        {
+            DPRINTK("Could not create netif: already exists\n");
+            create->status = NETIF_BE_STATUS_INTERFACE_EXISTS;
+            kfree(dev);
+            return;
+        }
+        pnetif = &(*pnetif)->hash_next;
+    }
+
+    dev->hard_start_xmit = netif_be_start_xmit;
+    dev->get_stats       = netif_be_get_stats;
+    memcpy(dev->dev_addr, create->mac, ETH_ALEN);
+
+    /* Disable queuing. */
+    dev->tx_queue_len = 0;
+
+    /* Force a different MAC from remote end. */
+    dev->dev_addr[2] ^= 1;
+
+    if ( (err = register_netdev(dev)) != 0 )
+    {
+        DPRINTK("Could not register new net device %s: err=%d\n",
+                dev->name, err);
+        create->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        kfree(dev);
+        return;
+    }
+
+    netif->hash_next = *pnetif;
+    *pnetif = netif;
+
+    DPRINTK("Successfully created netif\n");
+    create->status = NETIF_BE_STATUS_OKAY;
+}
+
+void netif_destroy(netif_be_destroy_t *destroy)
+{
+    domid_t       domid  = destroy->domid;
+    unsigned int  handle = destroy->netif_handle;
+    netif_t     **pnetif, *netif;
+
+    pnetif = &netif_hash[NETIF_HASH(domid, handle)];
+    while ( (netif = *pnetif) != NULL )
+    {
+        if ( (netif->domid == domid) && (netif->handle == handle) )
+        {
+            if ( netif->status != DISCONNECTED )
+                goto still_connected;
+            goto destroy;
+        }
+        pnetif = &netif->hash_next;
+    }
+
+    destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+    return;
+
+ still_connected:
+    destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
+    return;
+
+ destroy:
+    *pnetif = netif->hash_next;
+    unregister_netdev(netif->dev);
+    kfree(netif->dev);
+    destroy->status = NETIF_BE_STATUS_OKAY;
+}
+
+void netif_connect(netif_be_connect_t *connect)
+{
+    domid_t       domid  = connect->domid;
+    unsigned int  handle = connect->netif_handle;
+    unsigned int  evtchn = connect->evtchn;
+    unsigned long tx_shmem_frame = connect->tx_shmem_frame;
+    unsigned long rx_shmem_frame = connect->rx_shmem_frame;
+    struct vm_struct *vma;
+    pgprot_t      prot;
+    int           error;
+    netif_t      *netif;
+#if 0
+    struct net_device *eth0_dev;
+#endif
+
+    netif = netif_find_by_handle(domid, handle);
+    if ( unlikely(netif == NULL) )
+    {
+        DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", 
+                connect->domid, connect->netif_handle); 
+        connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    if ( netif->status != DISCONNECTED )
+    {
+        connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED;
+        return;
+    }
+
+    if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL )
+    {
+        connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+    error  = direct_remap_area_pages(&init_mm, 
+                                     VMALLOC_VMADDR(vma->addr),
+                                     tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                     prot, domid);
+    error |= direct_remap_area_pages(&init_mm, 
+                                     VMALLOC_VMADDR(vma->addr) + PAGE_SIZE,
+                                     rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                     prot, domid);
+    if ( error != 0 )
+    {
+        if ( error == -ENOMEM )
+            connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY;
+        else if ( error == -EFAULT )
+            connect->status = NETIF_BE_STATUS_MAPPING_ERROR;
+        else
+            connect->status = NETIF_BE_STATUS_ERROR;
+        vfree(vma->addr);
+        return;
+    }
+
+    netif->evtchn         = evtchn;
+    netif->irq            = bind_evtchn_to_irq(evtchn);
+    netif->tx_shmem_frame = tx_shmem_frame;
+    netif->rx_shmem_frame = rx_shmem_frame;
+    netif->tx             = 
+        (netif_tx_interface_t *)vma->addr;
+    netif->rx             = 
+        (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE);
+    netif->status         = CONNECTED;
+    netif_get(netif);
+
+    rtnl_lock();
+    (void)dev_open(netif->dev);
+    rtnl_unlock();
+
+    (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
+    netif_start_queue(netif->dev);
+
+    connect->status = NETIF_BE_STATUS_OKAY;
+}
+
+int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
+{
+    domid_t       domid  = disconnect->domid;
+    unsigned int  handle = disconnect->netif_handle;
+    netif_t      *netif;
+
+    netif = netif_find_by_handle(domid, handle);
+    if ( unlikely(netif == NULL) )
+    {
+        DPRINTK("netif_disconnect attempted for non-existent netif"
+                " (%u,%u)\n", disconnect->domid, disconnect->netif_handle); 
+        disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return 1; /* Caller will send response error message. */
+    }
+
+    if ( netif->status == CONNECTED )
+    {
+        netif->status = DISCONNECTING;
+        netif->disconnect_rspid = rsp_id;
+        wmb(); /* Let other CPUs see the status change. */
+        netif_stop_queue(netif->dev);
+        free_irq(netif->irq, netif);
+        netif_deschedule(netif);
+        netif_put(netif);
+        return 0; /* Caller should not send response message. */
+    }
+
+    disconnect->status = NETIF_BE_STATUS_OKAY;
+    return 1;
+}
+
+void netif_interface_init(void)
+{
+    memset(netif_hash, 0, sizeof(netif_hash));
+}
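
Interfaces are found through a fixed-size hash table keyed on (domid, handle): NETIF_HASH XORs the two values and masks with NETIF_HASHSZ-1 (a power of two), and collisions chain through hash_next exactly as netif_find_by_handle() walks them. A standalone sketch of that scheme with simplified types:

    #include <stddef.h>
    #include <stdio.h>

    #define HASHSZ 16 /* power of two, so the mask works as a modulus */
    #define HASH(d, h) (((unsigned)(d) ^ (unsigned)(h)) & (HASHSZ - 1))

    typedef struct netif {
        unsigned domid, handle;
        struct netif *hash_next;
    } netif_t;

    static netif_t *table[HASHSZ];

    static void insert(netif_t *n)
    {
        unsigned b = HASH(n->domid, n->handle);
        n->hash_next = table[b]; /* push onto the head of the chain */
        table[b] = n;
    }

    static netif_t *find(unsigned domid, unsigned handle)
    {
        netif_t *n = table[HASH(domid, handle)];
        while ( (n != NULL) && ((n->domid != domid) || (n->handle != handle)) )
            n = n->hash_next;
        return n;
    }

    int main(void)
    {
        /* domids 1 and 17 collide in bucket 1 (1 ^ 0 == 17 ^ 0 mod 16). */
        netif_t a = { 1, 0, NULL }, b = { 17, 0, NULL };
        insert(&a);
        insert(&b);
        printf("found domid=%u\n", find(17, 0)->domid); /* 17 */
        return 0;
    }
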
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
new file mode 100644 (file)
index 0000000..c038104
--- /dev/null
+++ b/linux-2.6.7-xen-sparse/drivers/xen/netback/netback.c
@@ -0,0 +1,778 @@
+/******************************************************************************
+ * arch/xen/drivers/netif/backend/main.c
+ * 
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ *  arch/xen/drivers/netif/frontend
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ */
+
+#include "common.h"
+
+static void netif_page_release(struct page *page);
+static void make_tx_response(netif_t *netif, 
+                             u16      id,
+                             s8       st);
+static int  make_rx_response(netif_t *netif, 
+                             u16      id, 
+                             s8       st,
+                             memory_t addr,
+                             u16      size);
+
+static void net_tx_action(unsigned long unused);
+static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
+
+static void net_rx_action(unsigned long unused);
+static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0);
+
+static struct sk_buff_head rx_queue;
+static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2];
+static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE*3];
+static unsigned char rx_notify[NR_EVENT_CHANNELS];
+
+/* Don't currently gate addition of an interface to the tx scheduling list. */
+#define tx_work_exists(_if) (1)
+
+#define MAX_PENDING_REQS 256
+static unsigned long mmap_vstart;
+#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
+
+#define PKT_PROT_LEN (ETH_HLEN + 20)
+
+static struct {
+    netif_tx_request_t req;
+    netif_t *netif;
+} pending_tx_info[MAX_PENDING_REQS];
+static u16 pending_ring[MAX_PENDING_REQS];
+typedef unsigned int PEND_RING_IDX;
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+static PEND_RING_IDX pending_prod, pending_cons;
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
+
+/* Freed TX SKBs get batched on this ring before return to pending_ring. */
+static u16 dealloc_ring[MAX_PENDING_REQS];
+static spinlock_t dealloc_lock = SPIN_LOCK_UNLOCKED;
+static PEND_RING_IDX dealloc_prod, dealloc_cons;
+
+static struct sk_buff_head tx_queue;
+static multicall_entry_t tx_mcl[MAX_PENDING_REQS];
+
+static struct list_head net_schedule_list;
+static spinlock_t net_schedule_list_lock;
+
+#define MAX_MFN_ALLOC 64
+static unsigned long mfn_list[MAX_MFN_ALLOC];
+static unsigned int alloc_index = 0;
+static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED;
+
+static void __refresh_mfn_list(void)
+{
+    int ret = HYPERVISOR_dom_mem_op(MEMOP_increase_reservation,
+                                    mfn_list, MAX_MFN_ALLOC);
+    if ( unlikely(ret != MAX_MFN_ALLOC) )
+        BUG();
+    alloc_index = MAX_MFN_ALLOC;
+}
+
+static unsigned long get_new_mfn(void)
+{
+    unsigned long mfn, flags;
+    spin_lock_irqsave(&mfn_lock, flags);
+    if ( alloc_index == 0 )
+        __refresh_mfn_list();
+    mfn = mfn_list[--alloc_index];
+    spin_unlock_irqrestore(&mfn_lock, flags);
+    return mfn;
+}
+
+static void dealloc_mfn(unsigned long mfn)
+{
+    unsigned long flags;
+    spin_lock_irqsave(&mfn_lock, flags);
+    if ( alloc_index != MAX_MFN_ALLOC )
+        mfn_list[alloc_index++] = mfn;
+    else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, &mfn, 1) != 1 )
+        BUG();
+    spin_unlock_irqrestore(&mfn_lock, flags);
+}
+
+static inline void maybe_schedule_tx_action(void)
+{
+    smp_mb();
+    if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
+         !list_empty(&net_schedule_list) )
+        tasklet_schedule(&net_tx_tasklet);
+}
+
+/*
+ * This is the primary RECEIVE function for a network interface.
+ * Note that, from the p.o.v. of /this/ OS it looks like a transmit.
+ */
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+    netif_t *netif = (netif_t *)dev->priv;
+
+    ASSERT(skb->dev == dev);
+
+    /* Drop the packet if the target domain has no receive buffers. */
+    if ( (netif->rx_req_cons == netif->rx->req_prod) ||
+         ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) )
+        goto drop;
+
+    /*
+     * We do not copy the packet unless:
+     *  1. The data is shared; or
+     *  2. It spans a page boundary; or
+     *  3. We cannot be sure the whole data page is allocated.
+     * The copying method is taken from skb_copy().
+     * NB. We also couldn't cope with fragmented packets, but we won't get
+ *     any because we do not advertise the NETIF_F_SG feature.
+     */
+    if ( skb_shared(skb) || skb_cloned(skb) || 
+         (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
+         ((skb->end - skb->head) < (PAGE_SIZE/2)) )
+    {
+        struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
+        int hlen = skb->data - skb->head;
+        if ( unlikely(nskb == NULL) )
+            goto drop;
+        skb_reserve(nskb, hlen);
+        __skb_put(nskb, skb->len);
+        (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
+        nskb->dev = skb->dev;
+        dev_kfree_skb(skb);
+        skb = nskb;
+    }
+
+    netif->rx_req_cons++;
+
+    skb_queue_tail(&rx_queue, skb);
+    tasklet_schedule(&net_rx_tasklet);
+
+    return 0;
+
+ drop:
+    netif->stats.rx_dropped++;
+    dev_kfree_skb(skb);
+    return 0;
+}
+
+#if 0
+static void xen_network_done_notify(void)
+{
+    static struct net_device *eth0_dev = NULL;
+    if ( unlikely(eth0_dev == NULL) )
+        eth0_dev = __dev_get_by_name("eth0");
+    netif_rx_schedule(eth0_dev);
+}
+/* 
+ * Add following to poll() function in NAPI driver (Tigon3 is example):
+ *  if ( xen_network_done() )
+ *      tg3_enable_ints(tp); 
+ */
+int xen_network_done(void)
+{
+    return skb_queue_empty(&rx_queue);
+}
+#endif
+
+static void net_rx_action(unsigned long unused)
+{
+    netif_t *netif;
+    s8 status;
+    u16 size, id, evtchn;
+    mmu_update_t *mmu;
+    multicall_entry_t *mcl;
+    unsigned long vdata, mdata, new_mfn;
+    struct sk_buff_head rxq;
+    struct sk_buff *skb;
+    u16 notify_list[NETIF_RX_RING_SIZE];
+    int notify_nr = 0;
+
+    skb_queue_head_init(&rxq);
+
+    mcl = rx_mcl;
+    mmu = rx_mmu;
+    while ( (skb = skb_dequeue(&rx_queue)) != NULL )
+    {
+        netif   = (netif_t *)skb->dev->priv;
+        vdata   = (unsigned long)skb->data;
+        mdata   = virt_to_machine(vdata);
+        new_mfn = get_new_mfn();
+        
+        mmu[0].ptr  = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+        mmu[0].val  = __pa(vdata) >> PAGE_SHIFT;        
+        mmu[1].val  = (unsigned long)(netif->domid<<16) & ~0xFFFFUL;
+        mmu[1].ptr  = (unsigned long)(netif->domid<< 0) & ~0xFFFFUL;
+        mmu[1].ptr |= MMU_EXTENDED_COMMAND;
+        mmu[1].val |= MMUEXT_SET_SUBJECTDOM;
+        mmu[2].ptr  = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
+        mmu[2].val  = MMUEXT_REASSIGN_PAGE;
+
+        mcl[0].op = __HYPERVISOR_update_va_mapping;
+        mcl[0].args[0] = vdata >> PAGE_SHIFT;
+        mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
+        mcl[0].args[2] = 0;
+        mcl[1].op = __HYPERVISOR_mmu_update;
+        mcl[1].args[0] = (unsigned long)mmu;
+        mcl[1].args[1] = 3;
+        mcl[1].args[2] = 0;
+
+        mcl += 2;
+        mmu += 3;
+
+        __skb_queue_tail(&rxq, skb);
+
+        /* Filled the batch queue? */
+        if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) )
+            break;
+    }
+
+    if ( mcl == rx_mcl )
+        return;
+
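+    /*
+     * Each packet queued an [update_va_mapping, mmu_update] pair above, so
+     * mcl[-2] is the last va update: have it flush the TLB once for the
+     * whole batch.
+     */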
+    mcl[-2].args[2] = UVMF_FLUSH_TLB;
+    if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) )
+        BUG();
+
+    mcl = rx_mcl;
+    mmu = rx_mmu;
+    while ( (skb = __skb_dequeue(&rxq)) != NULL )
+    {
+        netif   = (netif_t *)skb->dev->priv;
+        size    = skb->tail - skb->data;
+
+        /* Rederive the machine addresses. */
+        new_mfn = mcl[0].args[1] >> PAGE_SHIFT;
+        mdata   = ((mmu[2].ptr & PAGE_MASK) |
+                   ((unsigned long)skb->data & ~PAGE_MASK));
+        
+        phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn;
+        
+        atomic_set(&(skb_shinfo(skb)->dataref), 1);
+        skb_shinfo(skb)->nr_frags = 0;
+        skb_shinfo(skb)->frag_list = NULL;
+
+        netif->stats.rx_bytes += size;
+        netif->stats.rx_packets++;
+
+        /* The update_va_mapping() must not fail. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+            BUG();
+
+        /* Check the reassignment error code. */
+        status = NETIF_RSP_OKAY;
+        if ( unlikely(mcl[1].args[5] != 0) )
+        {
+            DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid);
+            dealloc_mfn(mdata >> PAGE_SHIFT);
+            status = NETIF_RSP_ERROR;
+        }
+
+        evtchn = netif->evtchn;
+        id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id;
+        if ( make_rx_response(netif, id, status, mdata, size) &&
+             (rx_notify[evtchn] == 0) )
+        {
+            rx_notify[evtchn] = 1;
+            notify_list[notify_nr++] = evtchn;
+        }
+
+        dev_kfree_skb(skb);
+
+        mcl += 2;
+        mmu += 3;
+    }
+
+    while ( notify_nr != 0 )
+    {
+        evtchn = notify_list[--notify_nr];
+        rx_notify[evtchn] = 0;
+        notify_via_evtchn(evtchn);
+    }
+
+    /* More work to do? */
+    if ( !skb_queue_empty(&rx_queue) )
+        tasklet_schedule(&net_rx_tasklet);
+#if 0
+    else
+        xen_network_done_notify();
+#endif
+}
+
+struct net_device_stats *netif_be_get_stats(struct net_device *dev)
+{
+    netif_t *netif = dev->priv;
+    return &netif->stats;
+}
+
+static int __on_net_schedule_list(netif_t *netif)
+{
+    return netif->list.next != NULL;
+}
+
+static void remove_from_net_schedule_list(netif_t *netif)
+{
+    spin_lock_irq(&net_schedule_list_lock);
+    if ( likely(__on_net_schedule_list(netif)) )
+    {
+        list_del(&netif->list);
+        netif->list.next = NULL;
+        netif_put(netif);
+    }
+    spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static void add_to_net_schedule_list_tail(netif_t *netif)
+{
+    if ( __on_net_schedule_list(netif) )
+        return;
+
+    spin_lock_irq(&net_schedule_list_lock);
+    if ( !__on_net_schedule_list(netif) && (netif->status == CONNECTED) )
+    {
+        list_add_tail(&netif->list, &net_schedule_list);
+        netif_get(netif);
+    }
+    spin_unlock_irq(&net_schedule_list_lock);
+}
+
+static inline void netif_schedule_work(netif_t *netif)
+{
+    if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+         ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+    {
+        add_to_net_schedule_list_tail(netif);
+        maybe_schedule_tx_action();
+    }
+}
+
+void netif_deschedule(netif_t *netif)
+{
+    remove_from_net_schedule_list(netif);
+}
+
+#if 0
+static void tx_credit_callback(unsigned long data)
+{
+    netif_t *netif = (netif_t *)data;
+    netif->remaining_credit = netif->credit_bytes;
+    netif_schedule_work(netif);
+}
+#endif
+
+static void net_tx_action(unsigned long unused)
+{
+    struct list_head *ent;
+    struct sk_buff *skb;
+    netif_t *netif;
+    netif_tx_request_t txreq;
+    u16 pending_idx;
+    NETIF_RING_IDX i;
+    struct page *page;
+    multicall_entry_t *mcl;
+    PEND_RING_IDX dc, dp;
+
+    if ( (dc = dealloc_cons) == (dp = dealloc_prod) )
+        goto skip_dealloc;
+
+    mcl = tx_mcl;
+    while ( dc != dp )
+    {
+        pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
+        mcl[0].op = __HYPERVISOR_update_va_mapping;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[1] = 0;
+        mcl[0].args[2] = 0;
+        mcl++;        
+    }
+
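+    /* One update_va_mapping per dealloc entry; flush the TLB on the last. */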
+    mcl[-1].args[2] = UVMF_FLUSH_TLB;
+    if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
+        BUG();
+
+    mcl = tx_mcl;
+    while ( dealloc_cons != dp )
+    {
+        /* The update_va_mapping() must not fail. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+            BUG();
+
+        pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
+
+        netif = pending_tx_info[pending_idx].netif;
+
+        spin_lock(&netif->tx_lock);
+        make_tx_response(netif, pending_tx_info[pending_idx].req.id, 
+                         NETIF_RSP_OKAY);
+        spin_unlock(&netif->tx_lock);
+        
+        pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+        
+        /*
+         * Scheduling checks must happen after the above response is posted.
+         * This avoids a possible race with a guest OS on another CPU.
+         */
+        mb();
+        if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+             ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+            add_to_net_schedule_list_tail(netif);
+        
+        netif_put(netif);
+
+        mcl++;
+    }
+
+ skip_dealloc:
+    mcl = tx_mcl;
+    while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
+            !list_empty(&net_schedule_list) )
+    {
+        /* Get a netif from the list with work to do. */
+        ent = net_schedule_list.next;
+        netif = list_entry(ent, netif_t, list);
+        netif_get(netif);
+        remove_from_net_schedule_list(netif);
+
+        /* Work to do? */
+        i = netif->tx_req_cons;
+        if ( (i == netif->tx->req_prod) ||
+             ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
+        {
+            netif_put(netif);
+            continue;
+        }
+        memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, 
+               sizeof(txreq));
+        netif->tx_req_cons++;
+
+#if 0
+        /* Credit-based scheduling. */
+        if ( txreq.size > netif->remaining_credit )
+        {
+            s_time_t now = NOW(), next_credit = 
+                netif->credit_timeout.expires + MICROSECS(netif->credit_usec);
+            if ( next_credit <= now )
+            {
+                netif->credit_timeout.expires = now;
+                netif->remaining_credit = netif->credit_bytes;
+            }
+            else
+            {
+                netif->remaining_credit = 0;
+                netif->credit_timeout.expires  = next_credit;
+                netif->credit_timeout.data     = (unsigned long)netif;
+                netif->credit_timeout.function = tx_credit_callback;
+                netif->credit_timeout.cpu      = smp_processor_id();
+                add_ac_timer(&netif->credit_timeout);
+                break;
+            }
+        }
+        netif->remaining_credit -= txreq.size;
+#endif
+
+        netif_schedule_work(netif);
+
+        if ( unlikely(txreq.size <= PKT_PROT_LEN) || 
+             unlikely(txreq.size > ETH_FRAME_LEN) )
+        {
+            DPRINTK("Bad packet size: %d\n", txreq.size);
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            continue; 
+        }
+
+        /* The payload must not cross a page boundary, as we map exactly one page. */
+        if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) 
+        {
+            DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", 
+                    txreq.addr, txreq.size, 
+                    (txreq.addr &~PAGE_MASK) + txreq.size);
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            continue;
+        }
+
+        pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
+
+        if ( unlikely((skb = alloc_skb(PKT_PROT_LEN+16, GFP_ATOMIC)) == NULL) )
+        {
+            DPRINTK("Can't allocate a skb in start_xmit.\n");
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            break;
+        }
+
+        /* Packets passed to netif_rx() must have some headroom. */
+        skb_reserve(skb, 16);
+
+        mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
+        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
+        mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
+        mcl[0].args[2] = 0;
+        mcl[0].args[3] = netif->domid;
+        mcl++;
+
+        memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq));
+        pending_tx_info[pending_idx].netif = netif;
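+        /*
+         * Stash the pending index at the start of the skb data; it is read
+         * back after the multicall below to locate this entry again.
+         */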
+        *((u16 *)skb->data) = pending_idx;
+
+        __skb_queue_tail(&tx_queue, skb);
+
+        pending_cons++;
+
+        /* Filled the batch queue? */
+        if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) )
+            break;
+    }
+
+    if ( mcl == tx_mcl )
+        return;
+
+    if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) )
+        BUG();
+
+    mcl = tx_mcl;
+    while ( (skb = __skb_dequeue(&tx_queue)) != NULL )
+    {
+        pending_idx = *((u16 *)skb->data);
+        netif       = pending_tx_info[pending_idx].netif;
+        memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq));
+
+        /* Check the remap error code. */
+        if ( unlikely(mcl[0].args[5] != 0) )
+        {
+            DPRINTK("Bad page frame\n");
+            make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
+            netif_put(netif);
+            kfree_skb(skb);
+            mcl++;
+            pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+            continue;
+        }
+
+        phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] =
+            txreq.addr >> PAGE_SHIFT;
+
+        __skb_put(skb, PKT_PROT_LEN);
+        memcpy(skb->data, 
+               (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
+               PKT_PROT_LEN);
+
+        page = virt_to_page(MMAP_VADDR(pending_idx));
+
+        /* Append the packet payload as a fragment. */
+        skb_shinfo(skb)->frags[0].page        = page;
+        skb_shinfo(skb)->frags[0].size        = txreq.size - PKT_PROT_LEN;
+        skb_shinfo(skb)->frags[0].page_offset = 
+            (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
+        skb_shinfo(skb)->nr_frags = 1;
+        skb->data_len  = txreq.size - PKT_PROT_LEN;
+        skb->len      += skb->data_len;
+
+        skb->dev      = netif->dev;
+        skb->protocol = eth_type_trans(skb, skb->dev);
+
+        /*
+         * Destructor information. We hideously abuse the 'mapping' pointer,
+         * which isn't otherwise used by us. The page deallocator is modified
+         * to interpret a non-NULL value as a destructor function to be called.
+         * This works okay because in all other cases the pointer must be NULL
+         * when the page is freed (normally Linux will explicitly bug out if
+         * it sees otherwise).
+         */
+        page->mapping = (struct address_space *)netif_page_release;
+        set_page_count(page, 1);
+
+        netif->stats.tx_bytes += txreq.size;
+        netif->stats.tx_packets++;
+
+        netif_rx(skb);
+        netif->dev->last_rx = jiffies;
+
+        mcl++;
+    }
+}
+
+static void netif_page_release(struct page *page)
+{
+    unsigned long flags;
+    u16 pending_idx = page - virt_to_page(mmap_vstart);
+
+    /* Stop the abuse. */
+    page->mapping = NULL;
+
+    spin_lock_irqsave(&dealloc_lock, flags);
+    dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
+    spin_unlock_irqrestore(&dealloc_lock, flags);
+
+    tasklet_schedule(&net_tx_tasklet);
+}
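+
+/*
+ * Illustrative sketch only (not part of this driver): the sparse
+ * mm/page_alloc.c is assumed to check 'mapping' on the final free and
+ * invoke it as a destructor, roughly as below. The function name and its
+ * placement here are hypothetical.
+ */
+#if 0
+static void free_page_check_destructor(struct page *page)
+{
+    void (*dtor)(struct page *) = (void (*)(struct page *))page->mapping;
+    if ( dtor != NULL )
+        (*dtor)(page);          /* e.g. netif_page_release() above */
+    else
+        __free_pages_ok(page, 0);
+}
+#endif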
+
+#if 0
+long flush_bufs_for_netif(netif_t *netif)
+{
+    NETIF_RING_IDX i;
+
+    /* Return any outstanding receive buffers to the guest OS. */
+    spin_lock(&netif->rx_lock);
+    for ( i = netif->rx_req_cons; 
+          (i != netif->rx->req_prod) &&
+              ((i-netif->rx_resp_prod) != NETIF_RX_RING_SIZE);
+          i++ )
+    {
+        make_rx_response(netif,
+                         netif->rx->ring[MASK_NETIF_RX_IDX(i)].req.id,
+                         NETIF_RSP_DROPPED, 0, 0);
+    }
+    netif->rx_req_cons = i;
+    spin_unlock(&netif->rx_lock);
+
+    /*
+     * Flush pending transmit buffers. The guest may still have to wait for
+     * buffers that are queued at a physical NIC.
+     */
+    spin_lock(&netif->tx_lock);
+    for ( i = netif->tx_req_cons; 
+          (i != netif->tx->req_prod) &&
+              ((i-netif->tx_resp_prod) != NETIF_TX_RING_SIZE);
+          i++ )
+    {
+        make_tx_response(netif,
+                         netif->tx->ring[MASK_NETIF_TX_IDX(i)].req.id,
+                         NETIF_RSP_DROPPED);
+    }
+    netif->tx_req_cons = i;
+    spin_unlock(&netif->tx_lock);
+
+    return 0;
+}
+#endif
+
+irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    netif_t *netif = dev_id;
+    if ( tx_work_exists(netif) )
+    {
+        add_to_net_schedule_list_tail(netif);
+        maybe_schedule_tx_action();
+    }
+    return IRQ_HANDLED;
+}
+
+static void make_tx_response(netif_t *netif, 
+                             u16      id,
+                             s8       st)
+{
+    NETIF_RING_IDX i = netif->tx_resp_prod;
+    netif_tx_response_t *resp;
+
+    resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp;
+    resp->id     = id;
+    resp->status = st;
+    wmb();
+    netif->tx->resp_prod = netif->tx_resp_prod = ++i;
+
+    mb(); /* Update producer before checking event threshold. */
+    if ( i == netif->tx->event )
+        notify_via_evtchn(netif->evtchn);
+}
+
+static int make_rx_response(netif_t *netif, 
+                            u16      id, 
+                            s8       st,
+                            memory_t addr,
+                            u16      size)
+{
+    NETIF_RING_IDX i = netif->rx_resp_prod;
+    netif_rx_response_t *resp;
+
+    resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
+    resp->addr   = addr;
+    resp->id     = id;
+    resp->status = (s16)size;
+    if ( st < 0 )
+        resp->status = (s16)st;
+    wmb();
+    netif->rx->resp_prod = netif->rx_resp_prod = ++i;
+
+    mb(); /* Update producer before checking event threshold. */
+    return (i == netif->rx->event);
+}
+
+static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
+{
+    struct list_head *ent;
+    netif_t *netif;
+    int i = 0;
+
+    printk(KERN_ALERT "netif_schedule_list:\n");
+    spin_lock_irq(&net_schedule_list_lock);
+
+    list_for_each ( ent, &net_schedule_list )
+    {
+        netif = list_entry(ent, netif_t, list);
+        printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n",
+               i, netif->rx_req_cons, netif->rx_resp_prod);               
+        printk(KERN_ALERT "   tx_req_cons=%08x tx_resp_prod=%08x)\n",
+               netif->tx_req_cons, netif->tx_resp_prod);
+        printk(KERN_ALERT "   shared(rx_req_prod=%08x rx_resp_prod=%08x\n",
+               netif->rx->req_prod, netif->rx->resp_prod);
+        printk(KERN_ALERT "   rx_event=%08x tx_req_prod=%08x\n",
+               netif->rx->event, netif->tx->req_prod);
+        printk(KERN_ALERT "   tx_resp_prod=%08x, tx_event=%08x)\n",
+               netif->tx->resp_prod, netif->tx->event);
+        i++;
+    }
+
+    spin_unlock_irq(&net_schedule_list_lock);
+    printk(KERN_ALERT " ** End of netif_schedule_list **\n");
+
+    return IRQ_HANDLED;
+}
+
+static int __init netback_init(void)
+{
+    int i;
+
+    if ( !(start_info.flags & SIF_NET_BE_DOMAIN) &&
+         !(start_info.flags & SIF_INITDOMAIN) )
+        return 0;
+
+    printk("Initialising Xen netif backend\n");
+
+    skb_queue_head_init(&rx_queue);
+    skb_queue_head_init(&tx_queue);
+
+    netif_interface_init();
+
+    if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
+        BUG();
+
+    pending_cons = 0;
+    pending_prod = MAX_PENDING_REQS;
+    for ( i = 0; i < MAX_PENDING_REQS; i++ )
+        pending_ring[i] = i;
+
+    spin_lock_init(&net_schedule_list_lock);
+    INIT_LIST_HEAD(&net_schedule_list);
+
+    netif_ctrlif_init();
+
+    (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG),
+                      netif_be_dbg, SA_SHIRQ, 
+                      "net-be-dbg", &netif_be_dbg);
+
+    return 0;
+}
+
+static void netback_cleanup(void)
+{
+    BUG();
+}
+
+module_init(netback_init);
+module_exit(netback_cleanup);
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Kconfig
new file mode 100644 (file)
index 0000000..334e6c3
--- /dev/null
@@ -0,0 +1,6 @@
+
+config XENNET
+       tristate "Xen network driver"
+       depends on NETDEVICES && ARCH_XEN
+       help
+         Network driver for Xen
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile b/linux-2.6.7-xen-sparse/drivers/xen/netfront/Makefile
new file mode 100644 (file)
index 0000000..7eb07a2
--- /dev/null
@@ -0,0 +1,2 @@
+
+obj-y  := netfront.o
diff --git a/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c b/linux-2.6.7-xen-sparse/drivers/xen/netfront/netfront.c
new file mode 100644 (file)
index 0000000..9381595
--- /dev/null
@@ -0,0 +1,882 @@
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+#include <asm-xen/evtchn.h>
+#include <asm-xen/ctrl_if.h>
+
+#include <asm/page.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <asm-xen/netif.h>
+#else
+#include "../netif.h"
+#define irqreturn_t void
+#define IRQ_HANDLED
+#endif
+
+#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator into handing us a whole, page-aligned page :-) */
+
+static void network_tx_buf_gc(struct net_device *dev);
+static void network_alloc_rx_buffers(struct net_device *dev);
+
+static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE];
+static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1];
+static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE];
+
+static struct list_head dev_list;
+
+struct net_private
+{
+    struct list_head list;
+    struct net_device *dev;
+
+    struct net_device_stats stats;
+    NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
+    unsigned int tx_full;
+    
+    netif_tx_interface_t *tx;
+    netif_rx_interface_t *rx;
+
+    spinlock_t   tx_lock;
+    spinlock_t   rx_lock;
+
+    unsigned int handle;
+    unsigned int evtchn;
+    unsigned int irq;
+
+    /* What is the status of our connection to the remote backend? */
+#define BEST_CLOSED       0
+#define BEST_DISCONNECTED 1
+#define BEST_CONNECTED    2
+    unsigned int backend_state;
+
+    /* Is this interface open or closed (down or up)? */
+#define UST_CLOSED        0
+#define UST_OPEN          1
+    unsigned int user_state;
+
+    /*
+     * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
+     * array is an index into a chain of free entries.
+     */
+    struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
+    struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
+};
+
+/* Access macros for acquiring/freeing slots in {tx,rx}_skbs[]. */
+#define ADD_ID_TO_FREELIST(_list, _id) do {        \
+    (_list)[(_id)] = (_list)[0];                   \
+    (_list)[0]     = (void *)(unsigned long)(_id); \
+} while (0)
+#define GET_ID_FROM_FREELIST(_list)                \
+ ({ unsigned long _id = (unsigned long)(_list)[0]; \
+    (_list)[0]  = (_list)[_id];                    \
+    (unsigned short)_id; })
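+/*
+ * Worked example (illustrative): the chain is initialised so that entry i
+ * holds i+1, i.e. [0]->1, [1]->2, ... GET_ID_FROM_FREELIST then returns 1
+ * and relinks [0] to 2; a later ADD_ID_TO_FREELIST(list, 1) stores the
+ * current head in [1] and makes 1 the head again.
+ */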
+
+static struct net_device *find_dev_by_handle(unsigned int handle)
+{
+    struct list_head *ent;
+    struct net_private *np;
+    list_for_each ( ent, &dev_list )
+    {
+        np = list_entry(ent, struct net_private, list);
+        if ( np->handle == handle )
+            return np->dev;
+    }
+    return NULL;
+}
+
+/** Network interface info. */
+struct netif_ctrl {
+    /** Number of interfaces. */
+    int interface_n;
+    /** Number of connected interfaces. */
+    int connected_n;
+    /** Error code. */
+    int err;
+};
+
+static struct netif_ctrl netctrl;
+
+static void netctrl_init(void)
+{
+    memset(&netctrl, 0, sizeof(netctrl));
+    netctrl.interface_n = -1;
+}
+
+/** Get or set a network interface error.
+ */
+static int netctrl_err(int err)
+{
+    if(err < 0 && !netctrl.err){
+        netctrl.err = err;
+        printk(KERN_WARNING "%s> err=%d\n", __FUNCTION__, err);
+    }
+    return netctrl.err;
+}
+
+/** Test if all network interfaces are connected.
+ *
+ * @return 1 if all connected, 0 if not, negative error code otherwise
+ */
+static int netctrl_connected(void)
+{
+    return netctrl.err ? netctrl.err :
+        (netctrl.connected_n == netctrl.interface_n);
+}
+
+/** Count the connected network interfaces.
+ *
+ * @return connected count
+ */
+static int netctrl_connected_count(void)
+{
+    struct list_head *ent;
+    struct net_private *np;
+    unsigned int connected = 0;
+
+    list_for_each ( ent, &dev_list )
+    {
+        np = list_entry(ent, struct net_private, list);
+        if ( np->backend_state == BEST_CONNECTED )
+            connected++;
+    }
+
+    netctrl.connected_n = connected;
+    return connected;
+}
+
+static int network_open(struct net_device *dev)
+{
+    struct net_private *np = dev->priv;
+
+    memset(&np->stats, 0, sizeof(np->stats));
+
+    np->user_state = UST_OPEN;
+
+    network_alloc_rx_buffers(dev);
+    np->rx->event = np->rx_resp_cons + 1;
+
+    netif_start_queue(dev);
+
+    return 0;
+}
+
+
+static void network_tx_buf_gc(struct net_device *dev)
+{
+    NETIF_RING_IDX i, prod;
+    unsigned short id;
+    struct net_private *np = dev->priv;
+    struct sk_buff *skb;
+
+    if ( np->backend_state != BEST_CONNECTED )
+        return;
+
+    do {
+        prod = np->tx->resp_prod;
+
+        for ( i = np->tx_resp_cons; i != prod; i++ )
+        {
+            id  = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id;
+            skb = np->tx_skbs[id];
+            ADD_ID_TO_FREELIST(np->tx_skbs, id);
+            dev_kfree_skb_any(skb);
+        }
+        
+        np->tx_resp_cons = prod;
+        
+        /*
+         * Set a new event, then check for race with update of tx_cons. Note
+         * that it is essential to schedule a callback, no matter how few
+         * buffers are pending. Even if there is space in the transmit ring,
+         * higher layers may be blocked because too much data is outstanding:
+         * in such cases notification from Xen is likely to be the only kick
+         * that we'll get.
+         */
+        np->tx->event = 
+            prod + ((np->tx->req_prod - prod) >> 1) + 1;
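+        /*
+         * Worked example (illustrative): with resp_prod == 4 and
+         * req_prod == 10 the event becomes 4 + 3 + 1 == 8, i.e. we ask to
+         * be notified once roughly half the outstanding packets are done.
+         */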
+        mb();
+    }
+    while ( prod != np->tx->resp_prod );
+
+    if ( np->tx_full && 
+         ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
+    {
+        np->tx_full = 0;
+        if ( np->user_state == UST_OPEN )
+            netif_wake_queue(dev);
+    }
+}
+
+
+static void network_alloc_rx_buffers(struct net_device *dev)
+{
+    unsigned short id;
+    struct net_private *np = dev->priv;
+    struct sk_buff *skb;
+    NETIF_RING_IDX i = np->rx->req_prod;
+    int nr_pfns = 0;
+
+    /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
+    if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || 
+         unlikely(np->backend_state != BEST_CONNECTED) )
+        return;
+
+    do {
+        skb = dev_alloc_skb(RX_BUF_SIZE);
+        if ( unlikely(skb == NULL) )
+            break;
+
+        skb->dev = dev;
+
+        if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) )
+            panic("alloc_skb needs to provide us page-aligned buffers.");
+
+        id = GET_ID_FROM_FREELIST(np->rx_skbs);
+
+        np->rx_skbs[id] = skb;
+        
+        np->rx->ring[MASK_NETIF_RX_IDX(i)].req.id = id;
+        
+        rx_pfn_array[nr_pfns] = virt_to_machine(skb->head) >> PAGE_SHIFT;
+
+        rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
+        rx_mcl[nr_pfns].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+        rx_mcl[nr_pfns].args[1] = 0;
+        rx_mcl[nr_pfns].args[2] = 0;
+
+        nr_pfns++;
+    }
+    while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
+
+    if ( unlikely(nr_pfns == 0) )
+        return;
+
+    /*
+     * We may have allocated buffers which have entries outstanding in the page
+     * update queue -- make sure we flush those first!
+     */
+    flush_page_update_queue();
+
+    /* After all PTEs have been zapped we blow away stale TLB entries. */
+    rx_mcl[nr_pfns-1].args[2] = UVMF_FLUSH_TLB;
+
+    /* Give away a batch of pages. */
+    rx_mcl[nr_pfns].op = __HYPERVISOR_dom_mem_op;
+    rx_mcl[nr_pfns].args[0] = MEMOP_decrease_reservation;
+    rx_mcl[nr_pfns].args[1] = (unsigned long)rx_pfn_array;
+    rx_mcl[nr_pfns].args[2] = (unsigned long)nr_pfns;
+
+    /* Zap PTEs and give away pages in one big multicall. */
+    (void)HYPERVISOR_multicall(rx_mcl, nr_pfns+1);
+
+    /* Check return status of HYPERVISOR_dom_mem_op(). */
+    if ( rx_mcl[nr_pfns].args[5] != nr_pfns )
+        panic("Unable to reduce memory reservation\n");
+
+    np->rx->req_prod = i;
+}
+
+
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+    unsigned short id;
+    struct net_private *np = (struct net_private *)dev->priv;
+    netif_tx_request_t *tx;
+    NETIF_RING_IDX i;
+
+    if ( unlikely(np->tx_full) )
+    {
+        printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name);
+        netif_stop_queue(dev);
+        return -ENOBUFS;
+    }
+
+    if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
+                  PAGE_SIZE) )
+    {
+        struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE);
+        if ( unlikely(new_skb == NULL) )
+            return 1;
+        skb_put(new_skb, skb->len);
+        memcpy(new_skb->data, skb->data, skb->len);
+        dev_kfree_skb(skb);
+        skb = new_skb;
+    }
+    
+    spin_lock_irq(&np->tx_lock);
+
+    if ( np->backend_state != BEST_CONNECTED )
+    {
+        spin_unlock_irq(&np->tx_lock);
+        return 1;
+    }
+
+    i = np->tx->req_prod;
+
+    id = GET_ID_FROM_FREELIST(np->tx_skbs);
+    np->tx_skbs[id] = skb;
+
+    tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req;
+
+    tx->id   = id;
+    tx->addr = virt_to_machine(skb->data);
+    tx->size = skb->len;
+
+    wmb();
+    np->tx->req_prod = i + 1;
+
+    network_tx_buf_gc(dev);
+
+    if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
+    {
+        np->tx_full = 1;
+        netif_stop_queue(dev);
+    }
+
+    spin_unlock_irq(&np->tx_lock);
+
+    np->stats.tx_bytes += skb->len;
+    np->stats.tx_packets++;
+
+    /* Only notify Xen if there are no outstanding responses. */
+    mb();
+    if ( np->tx->resp_prod == i )
+        notify_via_evtchn(np->evtchn);
+
+    return 0;
+}
+
+
+static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+    struct net_device *dev = dev_id;
+    struct net_private *np = dev->priv;
+    unsigned long flags;
+
+    spin_lock_irqsave(&np->tx_lock, flags);
+    network_tx_buf_gc(dev);
+    spin_unlock_irqrestore(&np->tx_lock, flags);
+
+    if ( (np->rx_resp_cons != np->rx->resp_prod) &&
+         (np->user_state == UST_OPEN) )
+        netif_rx_schedule(dev);
+
+    return IRQ_HANDLED;
+}
+
+
+static int netif_poll(struct net_device *dev, int *pbudget)
+{
+    struct net_private *np = dev->priv;
+    struct sk_buff *skb;
+    netif_rx_response_t *rx;
+    NETIF_RING_IDX i;
+    mmu_update_t *mmu = rx_mmu;
+    multicall_entry_t *mcl = rx_mcl;
+    int work_done, budget, more_to_do = 1;
+    struct sk_buff_head rxq;
+    unsigned long flags;
+
+    spin_lock(&np->rx_lock);
+
+    if ( np->backend_state != BEST_CONNECTED )
+    {
+        spin_unlock(&np->rx_lock);
+        return 0;
+    }
+
+    skb_queue_head_init(&rxq);
+
+    if ( (budget = *pbudget) > dev->quota )
+        budget = dev->quota;
+
+    for ( i = np->rx_resp_cons, work_done = 0; 
+          (i != np->rx->resp_prod) && (work_done < budget); 
+          i++, work_done++ )
+    {
+        rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp;
+
+        /*
+         * An error here is very odd. Usually indicates a backend bug,
+         * low-memory condition, or that we didn't have reservation headroom.
+         * Whatever - print an error and queue the id again straight away.
+         */
+        if ( unlikely(rx->status <= 0) )
+        {
+            /* Gate this error. We get a (valid) slew of them on suspend. */
+            if ( np->user_state == UST_OPEN )
+                printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
+            np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id;
+            wmb();
+            np->rx->req_prod++;
+            continue;
+        }
+
+        skb = np->rx_skbs[rx->id];
+        ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
+
+        skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
+        skb_put(skb, rx->status);
+
+        np->stats.rx_packets++;
+        np->stats.rx_bytes += rx->status;
+
+        /* Remap the page. */
+        mmu->ptr  = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
+        mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
+        mmu++;
+        mcl->op = __HYPERVISOR_update_va_mapping;
+        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
+        mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
+        mcl->args[2] = 0;
+        mcl++;
+
+        phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
+            rx->addr >> PAGE_SHIFT;
+
+        __skb_queue_tail(&rxq, skb);
+    }
+
+    /* Do all the remapping work, and M->P updates, in one big hypercall. */
+    if ( likely((mcl - rx_mcl) != 0) )
+    {
+        mcl->op = __HYPERVISOR_mmu_update;
+        mcl->args[0] = (unsigned long)rx_mmu;
+        mcl->args[1] = mmu - rx_mmu;
+        mcl->args[2] = 0;
+        mcl++;
+        (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
+    }
+
+    while ( (skb = __skb_dequeue(&rxq)) != NULL )
+    {
+        /* Set the shared-info area, which is hidden behind the real data. */
+        atomic_set(&(skb_shinfo(skb)->dataref), 1);
+        skb_shinfo(skb)->nr_frags = 0;
+        skb_shinfo(skb)->frag_list = NULL;
+
+        /* Ethernet-specific work. Delayed to here as it peeks the header. */
+        skb->protocol = eth_type_trans(skb, dev);
+
+        /* Pass it up. */
+        netif_rx(skb);
+        dev->last_rx = jiffies;
+    }
+
+    np->rx_resp_cons = i;
+
+    network_alloc_rx_buffers(dev);
+
+    *pbudget   -= work_done;
+    dev->quota -= work_done;
+
+    if ( work_done < budget )
+    {
+        local_irq_save(flags);
+
+        np->rx->event = i + 1;
+    
+        /* Deal with hypervisor racing our resetting of rx_event. */
+        mb();
+        if ( np->rx->resp_prod == i )
+        {
+            __netif_rx_complete(dev);
+            more_to_do = 0;
+        }
+
+        local_irq_restore(flags);
+    }
+
+    spin_unlock(&np->rx_lock);
+
+    return more_to_do;
+}
+
+
+static int network_close(struct net_device *dev)
+{
+    struct net_private *np = dev->priv;
+    np->user_state = UST_CLOSED;
+    netif_stop_queue(np->dev);
+    return 0;
+}
+
+
+static struct net_device_stats *network_get_stats(struct net_device *dev)
+{
+    struct net_private *np = (struct net_private *)dev->priv;
+    return &np->stats;
+}
+
+
+static void network_connect(struct net_device *dev,
+                            netif_fe_interface_status_changed_t *status)
+{
+    struct net_private *np;
+    int i, requeue_idx;
+    netif_tx_request_t *tx;
+
+    np = dev->priv;
+    spin_lock_irq(&np->rx_lock);
+    spin_lock(&np->tx_lock);
+
+    /* Recovery procedure: */
+
+    /* Step 1: Reinitialise variables. */
+    np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0;
+    np->rx->event = 1;
+
+    /* Step 2: Rebuild the RX and TX ring contents.
+     * NB. We could just free the queued TX packets now but we hope
+     * that sending them out might do some good.  We have to rebuild
+     * the RX ring because some of our pages are currently flipped out
+     * so we can't just free the RX skbs.
+     * NB2. Freelist index entries are always going to be less than
+     *  __PAGE_OFFSET, whereas pointers to skbs will always be equal or
+     * greater than __PAGE_OFFSET: we use this property to distinguish
+     * them.
+     */
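+    /*
+     * Illustrative: a free-chain entry such as (void *)3 is numerically
+     * tiny, whereas any real skb pointer is a kernel virtual address of
+     * at least __PAGE_OFFSET (0xC0000000 on default i386), so the
+     * comparisons below cleanly separate the two cases.
+     */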
+
+    /* Rebuild the TX buffer freelist and the TX ring itself.
+     * NB. This reorders packets.  We could keep more private state
+     * to avoid this but maybe it doesn't matter so much given the
+     * interface has been down.
+     */
+    for ( requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++ )
+    {
+        if ( (unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET )
+        {
+            struct sk_buff *skb = np->tx_skbs[i];
+
+            tx = &np->tx->ring[requeue_idx++].req;
+
+            tx->id   = i;
+            tx->addr = virt_to_machine(skb->data);
+            tx->size = skb->len;
+
+            np->stats.tx_bytes += skb->len;
+            np->stats.tx_packets++;
+        }
+    }
+    wmb();
+    np->tx->req_prod = requeue_idx;
+
+    /* Rebuild the RX buffer freelist and the RX ring itself. */
+    for ( requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++ )
+        if ( (unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET )
+            np->rx->ring[requeue_idx++].req.id = i;
+    wmb();
+    np->rx->req_prod = requeue_idx;
+
+    /* Step 3: All public and private state should now be sane.  Get
+     * ready to start sending and receiving packets and give the driver
+     * domain a kick because we've probably just requeued some
+     * packets.
+     */
+    np->backend_state = BEST_CONNECTED;
+    notify_via_evtchn(status->evtchn);  
+    network_tx_buf_gc(dev);
+
+    if ( np->user_state == UST_OPEN )
+        netif_start_queue(dev);
+
+    spin_unlock(&np->tx_lock);
+    spin_unlock_irq(&np->rx_lock);
+}
+
+static void netif_status_change(netif_fe_interface_status_changed_t *status)
+{
+    ctrl_msg_t                   cmsg;
+    netif_fe_interface_connect_t up;
+    struct net_device *dev;
+    struct net_private *np;
+    
+    if ( netctrl.interface_n <= 0 )
+    {
+        printk(KERN_WARNING "Status change: no interfaces\n");
+        return;
+    }
+
+    dev = find_dev_by_handle(status->handle);
+    if ( dev == NULL )
+    {
+        printk(KERN_WARNING "Status change: invalid netif handle %u\n",
+               status->handle);
+        return;
+    }
+    np  = dev->priv;
+    
+    switch ( status->status )
+    {
+    case NETIF_INTERFACE_STATUS_DESTROYED:
+        printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
+               np->backend_state);
+        break;
+
+    case NETIF_INTERFACE_STATUS_DISCONNECTED:
+        if ( np->backend_state != BEST_CLOSED )
+        {
+            printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
+                   " in state %d\n", np->backend_state);
+            printk(KERN_INFO "Attempting to reconnect network interface\n");
+
+            /* Begin interface recovery.
+             *
+             * NB. Whilst we're recovering, we turn the carrier state off.  We
+             * take measures to ensure that this device isn't used for
+             * anything.  We also stop the queue for this device.  Various
+             * different approaches (e.g. continuing to buffer packets) have
+             * been tested but don't appear to improve the overall impact on
+             * TCP connections.
+             *
+             * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery
+             * is initiated by a special "RESET" message - disconnect could
+             * just mean we're not allowed to use this interface any more.
+             */
+
+            /* Stop old i/f to prevent errors whilst we rebuild the state. */
+            spin_lock_irq(&np->tx_lock);
+            spin_lock(&np->rx_lock);
+            netif_stop_queue(dev);
+            np->backend_state = BEST_DISCONNECTED;
+            spin_unlock(&np->rx_lock);
+            spin_unlock_irq(&np->tx_lock);
+
+            /* Free resources. */
+            free_irq(np->irq, dev);
+            unbind_evtchn_from_irq(np->evtchn);
+            free_page((unsigned long)np->tx);
+            free_page((unsigned long)np->rx);
+        }
+
+        /* Move from CLOSED to DISCONNECTED state. */
+        np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
+        np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
+        if ( (np->tx == NULL) || (np->rx == NULL) )
+            BUG(); /* Cannot continue without the shared ring pages. */
+        memset(np->tx, 0, PAGE_SIZE);
+        memset(np->rx, 0, PAGE_SIZE);
+        np->backend_state = BEST_DISCONNECTED;
+
+        /* Construct an interface-CONNECT message for the domain controller. */
+        cmsg.type      = CMSG_NETIF_FE;
+        cmsg.subtype   = CMSG_NETIF_FE_INTERFACE_CONNECT;
+        cmsg.length    = sizeof(netif_fe_interface_connect_t);
+        up.handle      = status->handle;
+        up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
+        up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
+        memcpy(cmsg.msg, &up, sizeof(up));
+        
+        /* Tell the controller to bring up the interface. */
+        ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+        break;
+
+    case NETIF_INTERFACE_STATUS_CONNECTED:
+        if ( np->backend_state == BEST_CLOSED )
+        {
+            printk(KERN_WARNING "Unexpected netif-CONNECTED message"
+                   " in state %d\n", np->backend_state);
+            break;
+        }
+
+        memcpy(dev->dev_addr, status->mac, ETH_ALEN);
+
+        network_connect(dev, status);
+
+        np->evtchn = status->evtchn;
+        np->irq = bind_evtchn_to_irq(np->evtchn);
+        (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, 
+                          dev->name, dev);
+        
+        netctrl_connected_count();
+        break;
+
+    default:
+        printk(KERN_WARNING "Status change to unknown value %d\n", 
+               status->status);
+        break;
+    }
+}
+
+/** Create a network device.
+ * @param handle device handle
+ * @param val return parameter for created device
+ * @return 0 on success, error code otherwise
+ */
+static int create_netdev(int handle, struct net_device **val)
+{
+    int i, err = 0;
+    struct net_device *dev = NULL;
+    struct net_private *np = NULL;
+
+    if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
+    {
+        printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__);
+        err = -ENOMEM;
+        goto exit;
+    }
+
+    np                = dev->priv;
+    np->backend_state = BEST_CLOSED;
+    np->user_state    = UST_CLOSED;
+    np->handle        = handle;
+    
+    spin_lock_init(&np->tx_lock);
+    spin_lock_init(&np->rx_lock);
+
+    /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
+    for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
+        np->tx_skbs[i] = (void *)(i+1);
+    for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
+        np->rx_skbs[i] = (void *)(i+1);
+
+    dev->open            = network_open;
+    dev->hard_start_xmit = network_start_xmit;
+    dev->stop            = network_close;
+    dev->get_stats       = network_get_stats;
+    dev->poll            = netif_poll;
+    dev->weight          = 64;
+    
+    if ( (err = register_netdev(dev)) != 0 )
+    {
+        printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err);
+        goto exit;
+    }
+    np->dev = dev;
+    list_add(&np->list, &dev_list);
+
+  exit:
+    if ( (err != 0) && (dev != NULL) )
+        free_netdev(dev);
+    else if ( val != NULL )
+        *val = dev;
+    return err;
+}
+
+/*
+ * Initialize the network control interface. Set the number of network devices
+ * and create them.
+ */
+static void netif_driver_status_change(
+    netif_fe_driver_status_changed_t *status)
+{
+    int err = 0;
+    int i;
+    
+    netctrl.interface_n = status->nr_interfaces;
+    netctrl.connected_n = 0;
+
+    for ( i = 0; i < netctrl.interface_n; i++ )
+    {
+        if ( (err = create_netdev(i, NULL)) != 0 )
+        {
+            netctrl_err(err);
+            break;
+        }
+    }
+}
+
+static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    int respond = 1;
+
+    switch ( msg->subtype )
+    {
+    case CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED:
+        if ( msg->length != sizeof(netif_fe_interface_status_changed_t) )
+            goto error;
+        netif_status_change((netif_fe_interface_status_changed_t *)
+                            &msg->msg[0]);
+        break;
+
+    case CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
+        if ( msg->length != sizeof(netif_fe_driver_status_changed_t) )
+            goto error;
+        netif_driver_status_change((netif_fe_driver_status_changed_t *)
+                                   &msg->msg[0]);
+        /* Message is a response */
+        respond = 0;
+        break;
+
+    error:
+    default:
+        msg->length = 0;
+        break;
+    }
+
+    if ( respond )
+        ctrl_if_send_response(msg);
+}
+
+
+static int __init netif_init(void)
+{
+    ctrl_msg_t                       cmsg;
+    netif_fe_driver_status_changed_t st;
+    int err = 0, wait_i, wait_n = 20;
+
+    if ( (start_info.flags & SIF_INITDOMAIN) ||
+         (start_info.flags & SIF_NET_BE_DOMAIN) )
+        return 0;
+
+    printk("Initialising Xen virtual ethernet frontend driver");
+
+    INIT_LIST_HEAD(&dev_list);
+
+    netctrl_init();
+
+    (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    /* Send a driver-UP notification to the domain controller. */
+    cmsg.type      = CMSG_NETIF_FE;
+    cmsg.subtype   = CMSG_NETIF_FE_DRIVER_STATUS_CHANGED;
+    cmsg.length    = sizeof(netif_fe_driver_status_changed_t);
+    st.status      = NETIF_DRIVER_STATUS_UP;
+    st.nr_interfaces = 0;
+    memcpy(cmsg.msg, &st, sizeof(st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+
+    /* Wait for all interfaces to be connected. */
+    for ( wait_i = 0; ; wait_i++ )
+    {
+        if ( (err = (wait_i < wait_n) ? netctrl_connected() : -ENETDOWN) != 0 )
+        {
+            err = (err > 0) ? 0 : err;
+            break;
+        }
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+    }
+
+    if ( err )
+        ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
+
+    return err;
+}
+
+__initcall(netif_init);
index ff65b7eb416e5ca808933f26cc19c61365a4be3f..49bf0975ecdd208e8e1761156e80c74c88917b7b 100644 (file)
@@ -151,6 +151,12 @@ static inline int flush_page_update_queue(void)
 #define xen_flush_page_update_queue() (_flush_page_update_queue())
 void MULTICALL_flush_page_update_queue(void);
 
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+/* Allocate a contiguous empty region of low memory. Return virtual start. */
+unsigned long allocate_empty_lowmem_region(unsigned long pages);
+/* Deallocate a contiguous region of low memory. Return it to the allocator. */
+void deallocate_lowmem_region(unsigned long vstart, unsigned long pages);
+#endif
 
 /*
  * Assembler stubs for hyper-calls.
@@ -389,9 +395,12 @@ static inline int HYPERVISOR_update_va_mapping(
         "b" (page_nr), "c" ((new_val).pte_low), "d" (flags) : "memory" );
 
     if ( unlikely(ret < 0) )
-        panic("Failed update VA mapping: %08lx, %08lx, %08lx",
-              page_nr, (new_val).pte_low, flags);
-    
+    {
+        printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
+               page_nr, (new_val).pte_low, flags);
+        BUG();
+    }
+
     return ret;
 }
 
diff --git a/linux-2.6.7-xen-sparse/mm/page_alloc.c b/linux-2.6.7-xen-sparse/mm/page_alloc.c
new file mode 100644 (file)
index 0000000..5d9b765
--- /dev/null
@@ -0,0 +1,2017 @@
+/*
+ *  linux/mm/page_alloc.c
+ *
+ *  Manages the free list, the system allocates free pages here.
+ *  Note that kmalloc() lives in slab.c
+ *
+ *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *  Swap reorganised 29.12.95, Stephen Tweedie
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
+ */
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/pagevec.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/topology.h>
+#include <linux/sysctl.h>
+#include <linux/cpu.h>
+
+#include <asm/tlbflush.h>
+
+DECLARE_BITMAP(node_online_map, MAX_NUMNODES);
+struct pglist_data *pgdat_list;
+unsigned long totalram_pages;
+unsigned long totalhigh_pages;
+int nr_swap_pages;
+int numnodes = 1;
+int sysctl_lower_zone_protection = 0;
+
+EXPORT_SYMBOL(totalram_pages);
+EXPORT_SYMBOL(nr_swap_pages);
+
+/*
+ * Used by page_zone() to look up the address of the struct zone whose
+ * id is encoded in the upper bits of page->flags
+ */
+struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+EXPORT_SYMBOL(zone_table);
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+int min_free_kbytes = 1024;
+
+/*
+ * Temporary debugging check for pages not lying within a given zone.
+ */
+static int bad_range(struct zone *zone, struct page *page)
+{
+       if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
+               return 1;
+       if (page_to_pfn(page) < zone->zone_start_pfn)
+               return 1;
+       if (zone != page_zone(page))
+               return 1;
+       return 0;
+}
+
+static void bad_page(const char *function, struct page *page)
+{
+       printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
+               function, current->comm, page);
+       printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
+               (unsigned long)page->flags, page->mapping,
+               (int)page->mapcount, page_count(page));
+       printk(KERN_EMERG "Backtrace:\n");
+       dump_stack();
+       printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
+       page->flags &= ~(1 << PG_private        |
+                       1 << PG_locked  |
+                       1 << PG_lru     |
+                       1 << PG_active  |
+                       1 << PG_dirty   |
+                       1 << PG_maplock |
+                       1 << PG_anon    |
+                       1 << PG_swapcache |
+                       1 << PG_writeback);
+       set_page_count(page, 0);
+       page->mapping = NULL;
+       page->mapcount = 0;
+}
+
+#ifndef CONFIG_HUGETLB_PAGE
+#define prep_compound_page(page, order) do { } while (0)
+#define destroy_compound_page(page, order) do { } while (0)
+#else
+/*
+ * Higher-order pages are called "compound pages".  They are structured thusly:
+ *
+ * The first PAGE_SIZE page is called the "head page".
+ *
+ * The remaining PAGE_SIZE pages are called "tail pages".
+ *
+ * All pages have PG_compound set.  All pages have their ->private pointing at
+ * the head page (even the head page has this).
+ *
+ * The first tail page's ->mapping, if non-zero, holds the address of the
+ * compound page's put_page() function.
+ *
+ * The order of the allocation is stored in the first tail page's ->index
+ * This is only for debug at present.  This usage means that zero-order pages
+ * may not be compound.
+ */
+static void prep_compound_page(struct page *page, unsigned long order)
+{
+       int i;
+       int nr_pages = 1 << order;
+
+       page[1].mapping = 0;
+       page[1].index = order;
+       for (i = 0; i < nr_pages; i++) {
+               struct page *p = page + i;
+
+               SetPageCompound(p);
+               p->private = (unsigned long)page;
+       }
+}
+
+static void destroy_compound_page(struct page *page, unsigned long order)
+{
+       int i;
+       int nr_pages = 1 << order;
+
+       if (!PageCompound(page))
+               return;
+
+       if (page[1].index != order)
+               bad_page(__FUNCTION__, page);
+
+       for (i = 0; i < nr_pages; i++) {
+               struct page *p = page + i;
+
+               if (!PageCompound(p))
+                       bad_page(__FUNCTION__, page);
+               if (p->private != (unsigned long)page)
+                       bad_page(__FUNCTION__, page);
+               ClearPageCompound(p);
+       }
+}
+#endif         /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Freeing function for a buddy system allocator.
+ *
+ * The concept of a buddy system is to maintain direct-mapped table
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep one bit for each pair of blocks, which
+ * is set to 1 iff only one of the pair is allocated.  So when we
+ * are allocating or freeing one, we can derive the state of the
+ * other.  That is, if we allocate a small block, and both were   
+ * free, the remainder of the region must be split into blocks.   
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.            
+ *
+ * -- wli
+ */
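+
+/*
+ * Worked example (annotation): mask == ~0UL << order, so -mask == 1 << order.
+ * At order 0, page_idx ^ -mask toggles bit 0 and pairs page 6 with page 7;
+ * after 'mask <<= 1' it toggles bit 1 and pairs block {6,7} with block {4,5}.
+ */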
+
+static inline void __free_pages_bulk (struct page *page, struct page *base,
+               struct zone *zone, struct free_area *area, unsigned long mask,
+               unsigned int order)
+{
+       unsigned long page_idx, index;
+
+       if (order)
+               destroy_compound_page(page, order);
+       page_idx = page - base;
+       if (page_idx & ~mask)
+               BUG();
+       index = page_idx >> (1 + order);
+
+       zone->free_pages -= mask;
+       while (mask + (1 << (MAX_ORDER-1))) {
+               struct page *buddy1, *buddy2;
+
+               BUG_ON(area >= zone->free_area + MAX_ORDER);
+               if (!__test_and_change_bit(index, area->map))
+                       /*
+                        * the buddy page is still allocated.
+                        */
+                       break;
+               /*
+                * Move the buddy up one level.
+                * This code is taking advantage of the identity:
+                *      -mask = 1+~mask
+                */
+               buddy1 = base + (page_idx ^ -mask);
+               buddy2 = base + page_idx;
+               BUG_ON(bad_range(zone, buddy1));
+               BUG_ON(bad_range(zone, buddy2));
+               list_del(&buddy1->lru);
+               mask <<= 1;
+               area++;
+               index >>= 1;
+               page_idx &= mask;
+       }
+       list_add(&(base + page_idx)->lru, &area->free_list);
+}
+
+static inline void free_pages_check(const char *function, struct page *page)
+{
+       if (    page_mapped(page) ||
+               page->mapping != NULL ||
+               page_count(page) != 0 ||
+               (page->flags & (
+                       1 << PG_lru     |
+                       1 << PG_private |
+                       1 << PG_locked  |
+                       1 << PG_active  |
+                       1 << PG_reclaim |
+                       1 << PG_slab    |
+                       1 << PG_maplock |
+                       1 << PG_anon    |
+                       1 << PG_swapcache |
+                       1 << PG_writeback )))
+               bad_page(function, page);
+       if (PageDirty(page))
+               ClearPageDirty(page);
+}
+
+/*
+ * Frees a list of pages. 
+ * Assumes all pages on list are in same zone, and of same order.
+ * count is the number of pages to free, or 0 for all on the list.
+ *
+ * If the zone was previously in an "all pages pinned" state then look to
+ * see if this freeing clears that state.
+ *
+ * And clear the zone's pages_scanned counter, to hold off the "all pages are
+ * pinned" detection logic.
+ */
+static int
+free_pages_bulk(struct zone *zone, int count,
+               struct list_head *list, unsigned int order)
+{
+       unsigned long mask, flags;
+       struct free_area *area;
+       struct page *base, *page = NULL;
+       int ret = 0;
+
+       mask = (~0UL) << order;
+       base = zone->zone_mem_map;
+       area = zone->free_area + order;
+       spin_lock_irqsave(&zone->lock, flags);
+       zone->all_unreclaimable = 0;
+       zone->pages_scanned = 0;
+       while (!list_empty(list) && count--) {
+               page = list_entry(list->prev, struct page, lru);
+               /* have to delete it as __free_pages_bulk list manipulates */
+               list_del(&page->lru);
+               __free_pages_bulk(page, base, zone, area, mask, order);
+               ret++;
+       }
+       spin_unlock_irqrestore(&zone->lock, flags);
+       return ret;
+}
+
+void __free_pages_ok(struct page *page, unsigned int order)
+{
+       LIST_HEAD(list);
+       int i;
+
+       mod_page_state(pgfree, 1 << order);
+       for (i = 0 ; i < (1 << order) ; ++i)
+               free_pages_check(__FUNCTION__, page + i);
+       list_add(&page->lru, &list);
+       kernel_map_pages(page, 1<<order, 0);
+       free_pages_bulk(page_zone(page), 1, &list, order);
+}
+
+#define MARK_USED(index, order, area) \
+       __change_bit((index) >> (1+(order)), (area)->map)
+
+static inline struct page *
+expand(struct zone *zone, struct page *page,
+        unsigned long index, int low, int high, struct free_area *area)
+{
+       unsigned long size = 1 << high;
+
+       while (high > low) {
+               BUG_ON(bad_range(zone, page));
+               area--;
+               high--;
+               size >>= 1;
+               list_add(&page->lru, &area->free_list);
+               MARK_USED(index, high, area);
+               index += size;
+               page += size;
+       }
+       return page;
+}
+
+static inline void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+       set_page_count(page, 1);
+#else
+       int i;
+
+       /*
+        * We need to reference all the pages for this order, otherwise if
+        * anyone accesses one of the pages with (get/put) it will be freed.
+        */
+       for (i = 0; i < (1 << order); i++)
+               set_page_count(page+i, 1);
+#endif /* CONFIG_MMU */
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static void prep_new_page(struct page *page, int order)
+{
+       if (page->mapping || page_mapped(page) ||
+           (page->flags & (
+                       1 << PG_private |
+                       1 << PG_locked  |
+                       1 << PG_lru     |
+                       1 << PG_active  |
+                       1 << PG_dirty   |
+                       1 << PG_reclaim |
+                       1 << PG_maplock |
+                       1 << PG_anon    |
+                       1 << PG_swapcache |
+                       1 << PG_writeback )))
+               bad_page(__FUNCTION__, page);
+
+       page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
+                       1 << PG_referenced | 1 << PG_arch_1 |
+                       1 << PG_checked | 1 << PG_mappedtodisk);
+       page->private = 0;
+       set_page_refs(page, order);
+}
+
+/* 
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order)
+{
+       struct free_area * area;
+       unsigned int current_order;
+       struct page *page;
+       unsigned int index;
+
+       for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+               area = zone->free_area + current_order;
+               if (list_empty(&area->free_list))
+                       continue;
+
+               page = list_entry(area->free_list.next, struct page, lru);
+               list_del(&page->lru);
+               index = page - zone->zone_mem_map;
+               if (current_order != MAX_ORDER-1)
+                       MARK_USED(index, current_order, area);
+               zone->free_pages -= 1UL << order;
+               return expand(zone, page, index, order, current_order, area);
+       }
+
+       return NULL;
+}
+
+/* 
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency.  Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order, 
+                       unsigned long count, struct list_head *list)
+{
+       unsigned long flags;
+       int i;
+       int allocated = 0;
+       struct page *page;
+       
+       spin_lock_irqsave(&zone->lock, flags);
+       for (i = 0; i < count; ++i) {
+               page = __rmqueue(zone, order);
+               if (page == NULL)
+                       break;
+               allocated++;
+               list_add_tail(&page->lru, list);
+       }
+       spin_unlock_irqrestore(&zone->lock, flags);
+       return allocated;
+}
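+
+/*
+ * buffered_rmqueue() below uses this to refill the per-cpu page lists
+ * in batches, taking the zone lock only once per batch.
+ */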
+
+#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
+static void __drain_pages(unsigned int cpu)
+{
+       struct zone *zone;
+       int i;
+
+       for_each_zone(zone) {
+               struct per_cpu_pageset *pset;
+
+               pset = &zone->pageset[cpu];
+               for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+                       struct per_cpu_pages *pcp;
+
+                       pcp = &pset->pcp[i];
+                       pcp->count -= free_pages_bulk(zone, pcp->count,
+                                               &pcp->list, 0);
+               }
+       }
+}
+#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_PM
+int is_head_of_free_region(struct page *page)
+{
+       struct zone *zone = page_zone(page);
+       unsigned long flags;
+       int order;
+       struct list_head *curr;
+
+       /*
+        * Should not matter as we need quiescent system for
+        * suspend anyway, but...
+        */
+       spin_lock_irqsave(&zone->lock, flags);
+       for (order = MAX_ORDER - 1; order >= 0; --order)
+               list_for_each(curr, &zone->free_area[order].free_list)
+                       if (page == list_entry(curr, struct page, lru)) {
+                               spin_unlock_irqrestore(&zone->lock, flags);
+                               return 1 << order;
+                       }
+       spin_unlock_irqrestore(&zone->lock, flags);
+       return 0;
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ */
+void drain_local_pages(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);  
+       __drain_pages(smp_processor_id());
+       local_irq_restore(flags);       
+}
+#endif /* CONFIG_PM */
+
+static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+{
+#ifdef CONFIG_NUMA
+       unsigned long flags;
+       int cpu;
+       pg_data_t *pg = z->zone_pgdat;
+       pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
+       struct per_cpu_pageset *p;
+
+       local_irq_save(flags);
+       cpu = smp_processor_id();
+       p = &z->pageset[cpu];
+       if (pg == orig) {
+               z->pageset[cpu].numa_hit++;
+       } else {
+               p->numa_miss++;
+               zonelist->zones[0]->pageset[cpu].numa_foreign++;
+       }
+       if (pg == NODE_DATA(numa_node_id()))
+               p->local_node++;
+       else
+               p->other_node++;
+       local_irq_restore(flags);
+#endif
+}
+
+/*
+ * Free a 0-order page
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void fastcall free_hot_cold_page(struct page *page, int cold)
+{
+       struct zone *zone = page_zone(page);
+       struct per_cpu_pages *pcp;
+       unsigned long flags;
+
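+       /*
+        * Xen repurposes the page->mapping field of pages it hands out
+        * as network buffers: a non-NULL value is treated as a
+        * destructor to call instead of freeing the page back to the
+        * buddy allocator.
+        */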
+       /* XXX Xen: use mapping pointer as skb/data-page destructor */
+       if (page->mapping)
+               return (*(void(*)(struct page *))page->mapping)(page);
+
+       kernel_map_pages(page, 1, 0);
+       inc_page_state(pgfree);
+       free_pages_check(__FUNCTION__, page);
+       pcp = &zone->pageset[get_cpu()].pcp[cold];
+       local_irq_save(flags);
+       if (pcp->count >= pcp->high)
+               pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+       list_add(&page->lru, &pcp->list);
+       pcp->count++;
+       local_irq_restore(flags);
+       put_cpu();
+}
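+
+/*
+ * Freed pages are pushed on the head of the hot or cold per-cpu list
+ * and buffered_rmqueue() pops from the head, so recently freed (cache
+ * warm) pages are reused first; __GFP_COLD allocations draw from the
+ * cold list instead.
+ */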
+
+void fastcall free_hot_page(struct page *page)
+{
+       free_hot_cold_page(page, 0);
+}
+       
+void fastcall free_cold_page(struct page *page)
+{
+       free_hot_cold_page(page, 1);
+}
+
+/*
+ * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
+ * we cheat by calling it from here, in the order > 0 path.  Saves a branch
+ * or two.
+ */
+
+static struct page *
+buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
+{
+       unsigned long flags;
+       struct page *page = NULL;
+       int cold = !!(gfp_flags & __GFP_COLD);
+
+       if (order == 0) {
+               struct per_cpu_pages *pcp;
+
+               pcp = &zone->pageset[get_cpu()].pcp[cold];
+               local_irq_save(flags);
+               if (pcp->count <= pcp->low)
+                       pcp->count += rmqueue_bulk(zone, 0,
+                                               pcp->batch, &pcp->list);
+               if (pcp->count) {
+                       page = list_entry(pcp->list.next, struct page, lru);
+                       list_del(&page->lru);
+                       pcp->count--;
+               }
+               local_irq_restore(flags);
+               put_cpu();
+       }
+
+       if (page == NULL) {
+               spin_lock_irqsave(&zone->lock, flags);
+               page = __rmqueue(zone, order);
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+
+       if (page != NULL) {
+               BUG_ON(bad_range(zone, page));
+               mod_page_state_zone(zone, pgalloc, 1 << order);
+               prep_new_page(page, order);
+               if (order && (gfp_flags & __GFP_COMP))
+                       prep_compound_page(page, order);
+       }
+       return page;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ *
+ * Herein lies the mysterious "incremental min", expressed here as the
+ * per-zone protection[] contribution to the watermark:
+ *
+ *     min = (1<<order) + z->protection[alloc_type];
+ *
+ * The intent is to provide additional protection to low zones for
+ * allocation requests which _could_ use higher zones.  So a GFP_HIGHMEM
+ * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
+ * request.  This preserves additional space in those lower zones for requests
+ * which really do need memory from those zones.  It means that on a decent
+ * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
+ * zone untouched.
+ */
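+/*
+ * Concretely: a GFP_HIGHMEM request falling back to ZONE_NORMAL must
+ * clear free_pages >= (1<<order) + protection[ZONE_HIGHMEM], while a
+ * GFP_KERNEL request to the same zone only has to clear the smaller
+ * protection[ZONE_NORMAL] threshold, so the fallback request backs off
+ * (and defers to kswapd) sooner.
+ */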
+struct page * fastcall
+__alloc_pages(unsigned int gfp_mask, unsigned int order,
+               struct zonelist *zonelist)
+{
+       const int wait = gfp_mask & __GFP_WAIT;
+       unsigned long min;
+       struct zone **zones;
+       struct page *page;
+       struct reclaim_state reclaim_state;
+       struct task_struct *p = current;
+       int i;
+       int alloc_type;
+       int do_retry;
+
+       might_sleep_if(wait);
+
+       zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+       if (zones[0] == NULL)     /* no zones in the zonelist */
+               return NULL;
+
+       alloc_type = zone_idx(zones[0]);
+
+       /* Go through the zonelist once, looking for a zone with enough free */
+       for (i = 0; zones[i] != NULL; i++) {
+               struct zone *z = zones[i];
+
+               min = (1<<order) + z->protection[alloc_type];
+
+               /*
+                * We let real-time tasks dip their real-time paws a little
+                * deeper into reserves.
+                */
+               if (rt_task(p))
+                       min -= z->pages_low >> 1;
+
+               if (z->free_pages >= min ||
+                               (!wait && z->free_pages >= z->pages_high)) {
+                       page = buffered_rmqueue(z, order, gfp_mask);
+                       if (page) {
+                               zone_statistics(zonelist, z);
+                               goto got_pg;
+                       }
+               }
+       }
+
+       /* we're somewhat low on memory, failed to find what we needed */
+       for (i = 0; zones[i] != NULL; i++)
+               wakeup_kswapd(zones[i]);
+
+       /* Go through the zonelist again, taking __GFP_HIGH into account */
+       for (i = 0; zones[i] != NULL; i++) {
+               struct zone *z = zones[i];
+
+               min = (1<<order) + z->protection[alloc_type];
+
+               if (gfp_mask & __GFP_HIGH)
+                       min -= z->pages_low >> 2;
+               if (rt_task(p))
+                       min -= z->pages_low >> 1;
+
+               if (z->free_pages >= min ||
+                               (!wait && z->free_pages >= z->pages_high)) {
+                       page = buffered_rmqueue(z, order, gfp_mask);
+                       if (page) {
+                               zone_statistics(zonelist, z);
+                               goto got_pg;
+                       }
+               }
+       }
+
+       /* here we're in the low-on-memory slow path */
+
+rebalance:
+       if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) {
+               /* go through the zonelist yet again, ignoring mins */
+               for (i = 0; zones[i] != NULL; i++) {
+                       struct zone *z = zones[i];
+
+                       page = buffered_rmqueue(z, order, gfp_mask);
+                       if (page) {
+                               zone_statistics(zonelist, z);
+                               goto got_pg;
+                       }
+               }
+               goto nopage;
+       }
+
+       /* Atomic allocations - we can't balance anything */
+       if (!wait)
+               goto nopage;
+
+       p->flags |= PF_MEMALLOC;
+       reclaim_state.reclaimed_slab = 0;
+       p->reclaim_state = &reclaim_state;
+
+       try_to_free_pages(zones, gfp_mask, order);
+
+       p->reclaim_state = NULL;
+       p->flags &= ~PF_MEMALLOC;
+
+       /* go through the zonelist yet one more time */
+       for (i = 0; zones[i] != NULL; i++) {
+               struct zone *z = zones[i];
+
+               min = (1UL << order) + z->protection[alloc_type];
+
+               if (z->free_pages >= min ||
+                               (!wait && z->free_pages >= z->pages_high)) {
+                       page = buffered_rmqueue(z, order, gfp_mask);
+                       if (page) {
+                               zone_statistics(zonelist, z);
+                               goto got_pg;
+                       }
+               }
+       }
+
+       /*
+        * Don't let big-order allocations loop unless the caller explicitly
+        * requests that.  Wait for some write requests to complete then retry.
+        *
+        * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
+        * may not be true in other implementations.
+        */
+       do_retry = 0;
+       if (!(gfp_mask & __GFP_NORETRY)) {
+               if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+                       do_retry = 1;
+               if (gfp_mask & __GFP_NOFAIL)
+                       do_retry = 1;
+       }
+       if (do_retry) {
+               blk_congestion_wait(WRITE, HZ/50);
+               goto rebalance;
+       }
+
+nopage:
+       if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+               printk(KERN_WARNING "%s: page allocation failure."
+                       " order:%d, mode:0x%x\n",
+                       p->comm, order, gfp_mask);
+               dump_stack();
+       }
+       return NULL;
+got_pg:
+       kernel_map_pages(page, 1 << order, 1);
+       return page;
+}
+
+EXPORT_SYMBOL(__alloc_pages);
+
+#ifdef CONFIG_NUMA
+/* Early boot: Everything is done by one cpu, but the data structures will be
+ * used by all cpus - spread them on all nodes.
+ */
+static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order)
+{
+       static int nodenr;
+       int i = nodenr;
+       struct page *page;
+
+       for (;;) {
+               if (i > nodenr + numnodes)
+                       return 0;
+               if (node_present_pages(i%numnodes)) {
+                       struct zone **z;
+                       /* The node contains memory. Check that there is
+                        * memory in the intended zonelist.
+                        */
+                       z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones;
+                       while (*z) {
+                               if ( (*z)->free_pages > (1UL<<order))
+                                       goto found_node;
+                               z++;
+                       }
+               }
+               i++;
+       }
+found_node:
+       nodenr = i+1;
+       page = alloc_pages_node(i%numnodes, gfp_mask, order);
+       if (!page)
+               return 0;
+       return (unsigned long) page_address(page);
+}
+#endif
+
+/*
+ * Common helper functions.
+ */
+fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
+{
+       struct page * page;
+
+#ifdef CONFIG_NUMA
+       if (unlikely(system_state == SYSTEM_BOOTING))
+               return get_boot_pages(gfp_mask, order);
+#endif
+       page = alloc_pages(gfp_mask, order);
+       if (!page)
+               return 0;
+       return (unsigned long) page_address(page);
+}
+
+EXPORT_SYMBOL(__get_free_pages);
+
+fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
+{
+       struct page * page;
+
+       /*
+        * get_zeroed_page() returns a 32-bit address, which cannot represent
+        * a highmem page
+        */
+       BUG_ON(gfp_mask & __GFP_HIGHMEM);
+
+       page = alloc_pages(gfp_mask, 0);
+       if (page) {
+               void *address = page_address(page);
+               clear_page(address);
+               return (unsigned long) address;
+       }
+       return 0;
+}
+
+EXPORT_SYMBOL(get_zeroed_page);
+
+void __pagevec_free(struct pagevec *pvec)
+{
+       int i = pagevec_count(pvec);
+
+       while (--i >= 0)
+               free_hot_cold_page(pvec->pages[i], pvec->cold);
+}
+
+fastcall void __free_pages(struct page *page, unsigned int order)
+{
+       if (!PageReserved(page) && put_page_testzero(page)) {
+               if (order == 0)
+                       free_hot_page(page);
+               else
+                       __free_pages_ok(page, order);
+       }
+}
+
+EXPORT_SYMBOL(__free_pages);
+
+fastcall void free_pages(unsigned long addr, unsigned int order)
+{
+       if (addr != 0) {
+               BUG_ON(!virt_addr_valid(addr));
+               __free_pages(virt_to_page(addr), order);
+       }
+}
+
+EXPORT_SYMBOL(free_pages);
+
+/*
+ * Total amount of free (allocatable) RAM:
+ */
+unsigned int nr_free_pages(void)
+{
+       unsigned int sum = 0;
+       struct zone *zone;
+
+       for_each_zone(zone)
+               sum += zone->free_pages;
+
+       return sum;
+}
+
+EXPORT_SYMBOL(nr_free_pages);
+
+unsigned int nr_used_zone_pages(void)
+{
+       unsigned int pages = 0;
+       struct zone *zone;
+
+       for_each_zone(zone)
+               pages += zone->nr_active + zone->nr_inactive;
+
+       return pages;
+}
+
+#ifdef CONFIG_NUMA
+unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
+{
+       unsigned int i, sum = 0;
+
+       for (i = 0; i < MAX_NR_ZONES; i++)
+               sum += pgdat->node_zones[i].free_pages;
+
+       return sum;
+}
+#endif
+
+static unsigned int nr_free_zone_pages(int offset)
+{
+       pg_data_t *pgdat;
+       unsigned int sum = 0;
+
+       for_each_pgdat(pgdat) {
+               struct zonelist *zonelist = pgdat->node_zonelists + offset;
+               struct zone **zonep = zonelist->zones;
+               struct zone *zone;
+
+               for (zone = *zonep++; zone; zone = *zonep++) {
+                       unsigned long size = zone->present_pages;
+                       unsigned long high = zone->pages_high;
+                       if (size > high)
+                               sum += size - high;
+               }
+       }
+
+       return sum;
+}
+
+/*
+ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
+ */
+unsigned int nr_free_buffer_pages(void)
+{
+       return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK);
+}
+
+/*
+ * Amount of free RAM allocatable within all zones
+ */
+unsigned int nr_free_pagecache_pages(void)
+{
+       return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
+}
+
+#ifdef CONFIG_HIGHMEM
+unsigned int nr_free_highpages (void)
+{
+       pg_data_t *pgdat;
+       unsigned int pages = 0;
+
+       for_each_pgdat(pgdat)
+               pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+       return pages;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+static void show_node(struct zone *zone)
+{
+       printk("Node %d ", zone->zone_pgdat->node_id);
+}
+#else
+#define show_node(zone)        do { } while (0)
+#endif
+
+/*
+ * Accumulate the page_state information across all CPUs.
+ * The result is unavoidably approximate - it can change
+ * during and after execution of this function.
+ */
+DEFINE_PER_CPU(struct page_state, page_states) = {0};
+EXPORT_PER_CPU_SYMBOL(page_states);
+
+atomic_t nr_pagecache = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_pagecache);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
+#endif
+
+void __get_page_state(struct page_state *ret, int nr)
+{
+       int cpu = 0;
+
+       memset(ret, 0, sizeof(*ret));
+       while (cpu < NR_CPUS) {
+               unsigned long *in, *out, off;
+
+               if (!cpu_possible(cpu)) {
+                       cpu++;
+                       continue;
+               }
+
+               in = (unsigned long *)&per_cpu(page_states, cpu);
+               cpu++;
+               if (cpu < NR_CPUS && cpu_possible(cpu))
+                       prefetch(&per_cpu(page_states, cpu));
+               out = (unsigned long *)ret;
+               for (off = 0; off < nr; off++)
+                       *out++ += *in++;
+       }
+}
+
+void get_page_state(struct page_state *ret)
+{
+       int nr;
+
+       nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+       nr /= sizeof(unsigned long);
+
+       __get_page_state(ret, nr + 1);
+}
+
+void get_full_page_state(struct page_state *ret)
+{
+       __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
+}
+
+unsigned long __read_page_state(unsigned offset)
+{
+       unsigned long ret = 0;
+       int cpu;
+
+       for (cpu = 0; cpu < NR_CPUS; cpu++) {
+               unsigned long in;
+
+               if (!cpu_possible(cpu))
+                       continue;
+
+               in = (unsigned long)&per_cpu(page_states, cpu) + offset;
+               ret += *((unsigned long *)in);
+       }
+       return ret;
+}
+
+void get_zone_counts(unsigned long *active,
+               unsigned long *inactive, unsigned long *free)
+{
+       struct zone *zone;
+
+       *active = 0;
+       *inactive = 0;
+       *free = 0;
+       for_each_zone(zone) {
+               *active += zone->nr_active;
+               *inactive += zone->nr_inactive;
+               *free += zone->free_pages;
+       }
+}
+
+void si_meminfo(struct sysinfo *val)
+{
+       val->totalram = totalram_pages;
+       val->sharedram = 0;
+       val->freeram = nr_free_pages();
+       val->bufferram = nr_blockdev_pages();
+#ifdef CONFIG_HIGHMEM
+       val->totalhigh = totalhigh_pages;
+       val->freehigh = nr_free_highpages();
+#else
+       val->totalhigh = 0;
+       val->freehigh = 0;
+#endif
+       val->mem_unit = PAGE_SIZE;
+}
+
+EXPORT_SYMBOL(si_meminfo);
+
+#ifdef CONFIG_NUMA
+void si_meminfo_node(struct sysinfo *val, int nid)
+{
+       pg_data_t *pgdat = NODE_DATA(nid);
+
+       val->totalram = pgdat->node_present_pages;
+       val->freeram = nr_free_pages_pgdat(pgdat);
+       val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+       val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+       val->mem_unit = PAGE_SIZE;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also walk each zone's per-order free lists and print how much
+ * memory sits on each, which gives a rough picture of fragmentation.
+ */
+void show_free_areas(void)
+{
+       struct page_state ps;
+       int cpu, temperature;
+       unsigned long active;
+       unsigned long inactive;
+       unsigned long free;
+       struct zone *zone;
+
+       for_each_zone(zone) {
+               show_node(zone);
+               printk("%s per-cpu:", zone->name);
+
+               if (!zone->present_pages) {
+                       printk(" empty\n");
+                       continue;
+               } else
+                       printk("\n");
+
+               for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+                       struct per_cpu_pageset *pageset;
+
+                       if (!cpu_possible(cpu))
+                               continue;
+
+                       pageset = zone->pageset + cpu;
+
+                       for (temperature = 0; temperature < 2; temperature++)
+                               printk("cpu %d %s: low %d, high %d, batch %d\n",
+                                       cpu,
+                                       temperature ? "cold" : "hot",
+                                       pageset->pcp[temperature].low,
+                                       pageset->pcp[temperature].high,
+                                       pageset->pcp[temperature].batch);
+               }
+       }
+
+       get_page_state(&ps);
+       get_zone_counts(&active, &inactive, &free);
+
+       printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+               K(nr_free_pages()),
+               K(nr_free_highpages()));
+
+       printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+               "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
+               active,
+               inactive,
+               ps.nr_dirty,
+               ps.nr_writeback,
+               ps.nr_unstable,
+               nr_free_pages(),
+               ps.nr_slab,
+               ps.nr_mapped,
+               ps.nr_page_table_pages);
+
+       for_each_zone(zone) {
+               int i;
+
+               show_node(zone);
+               printk("%s"
+                       " free:%lukB"
+                       " min:%lukB"
+                       " low:%lukB"
+                       " high:%lukB"
+                       " active:%lukB"
+                       " inactive:%lukB"
+                       " present:%lukB"
+                       "\n",
+                       zone->name,
+                       K(zone->free_pages),
+                       K(zone->pages_min),
+                       K(zone->pages_low),
+                       K(zone->pages_high),
+                       K(zone->nr_active),
+                       K(zone->nr_inactive),
+                       K(zone->present_pages)
+                       );
+               printk("protections[]:");
+               for (i = 0; i < MAX_NR_ZONES; i++)
+                       printk(" %lu", zone->protection[i]);
+               printk("\n");
+       }
+
+       for_each_zone(zone) {
+               struct list_head *elem;
+               unsigned long nr, flags, order, total = 0;
+
+               show_node(zone);
+               printk("%s: ", zone->name);
+               if (!zone->present_pages) {
+                       printk("empty\n");
+                       continue;
+               }
+
+               spin_lock_irqsave(&zone->lock, flags);
+               for (order = 0; order < MAX_ORDER; order++) {
+                       nr = 0;
+                       list_for_each(elem, &zone->free_area[order].free_list)
+                               ++nr;
+                       total += nr << order;
+                       printk("%lu*%lukB ", nr, K(1UL) << order);
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
+               printk("= %lukB\n", K(total));
+       }
+
+       show_swap_cache_info();
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ */
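+/*
+ * Note that the switch below relies on deliberate fall-through: a
+ * ZONE_HIGHMEM request also picks up ZONE_NORMAL and ZONE_DMA as
+ * fallbacks, and a ZONE_NORMAL request picks up ZONE_DMA.
+ */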
+static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
+{
+       switch (k) {
+               struct zone *zone;
+       default:
+               BUG();
+       case ZONE_HIGHMEM:
+               zone = pgdat->node_zones + ZONE_HIGHMEM;
+               if (zone->present_pages) {
+#ifndef CONFIG_HIGHMEM
+                       BUG();
+#endif
+                       zonelist->zones[j++] = zone;
+               }
+       case ZONE_NORMAL:
+               zone = pgdat->node_zones + ZONE_NORMAL;
+               if (zone->present_pages)
+                       zonelist->zones[j++] = zone;
+       case ZONE_DMA:
+               zone = pgdat->node_zones + ZONE_DMA;
+               if (zone->present_pages)
+                       zonelist->zones[j++] = zone;
+       }
+
+       return j;
+}
+
+#ifdef CONFIG_NUMA
+#define MAX_NODE_LOAD (numnodes)
+static int __initdata node_load[MAX_NUMNODES];
+/**
+ * find_next_best_node - find the next node that should appear in a given
+ *    node's fallback list
+ * @node: node whose fallback list we're appending
+ * @used_node_mask: pointer to the bitmap of already used nodes
+ *
+ * We use a number of factors to determine which is the next node that should
+ * appear on a given node's fallback list.  The node should not have appeared
+ * already in @node's fallback list, and it should be the next closest node
+ * according to the distance array (which contains arbitrary distance values
+ * from each node to each node in the system), and should also prefer nodes
+ * with no CPUs, since presumably they'll have very little allocation pressure
+ * on them otherwise.
+ * It returns -1 if no node is found.
+ */
+static int __init find_next_best_node(int node, void *used_node_mask)
+{
+       int i, n, val;
+       int min_val = INT_MAX;
+       int best_node = -1;
+
+       for (i = 0; i < numnodes; i++) {
+               cpumask_t tmp;
+
+               /* Start from local node */
+               n = (node+i)%numnodes;
+
+               /* Don't want a node to appear more than once */
+               if (test_bit(n, used_node_mask))
+                       continue;
+
+               /* Use the distance array to find the distance */
+               val = node_distance(node, n);
+
+               /* Give preference to headless and unused nodes */
+               tmp = node_to_cpumask(n);
+               if (!cpus_empty(tmp))
+                       val += PENALTY_FOR_NODE_WITH_CPUS;
+
+               /* Slight preference for less loaded node */
+               val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+               val += node_load[n];
+
+               if (val < min_val) {
+                       min_val = val;
+                       best_node = n;
+               }
+       }
+
+       if (best_node >= 0)
+               set_bit(best_node, used_node_mask);
+
+       return best_node;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+       int i, j, k, node, local_node;
+       int prev_node, load;
+       struct zonelist *zonelist;
+       DECLARE_BITMAP(used_mask, MAX_NUMNODES);
+
+       /* initialize zonelists */
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               zonelist = pgdat->node_zonelists + i;
+               memset(zonelist, 0, sizeof(*zonelist));
+               zonelist->zones[0] = NULL;
+       }
+
+       /* NUMA-aware ordering of nodes */
+       local_node = pgdat->node_id;
+       load = numnodes;
+       prev_node = local_node;
+       bitmap_zero(used_mask, MAX_NUMNODES);
+       while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
+               /*
+                * We don't want to pressure a particular node.
+                * So adding penalty to the first node in same
+                * distance group to make it round-robin.
+                */
+               if (node_distance(local_node, node) !=
+                               node_distance(local_node, prev_node))
+                       node_load[node] += load;
+               prev_node = node;
+               load--;
+               for (i = 0; i < MAX_NR_ZONES; i++) {
+                       zonelist = pgdat->node_zonelists + i;
+                       for (j = 0; zonelist->zones[j] != NULL; j++);
+
+                       k = ZONE_NORMAL;
+                       if (i & __GFP_HIGHMEM)
+                               k = ZONE_HIGHMEM;
+                       if (i & __GFP_DMA)
+                               k = ZONE_DMA;
+
+                       j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+                       zonelist->zones[j] = NULL;
+               }
+       }
+}
+
+#else  /* CONFIG_NUMA */
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+       int i, j, k, node, local_node;
+
+       local_node = pgdat->node_id;
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               struct zonelist *zonelist;
+
+               zonelist = pgdat->node_zonelists + i;
+               memset(zonelist, 0, sizeof(*zonelist));
+
+               j = 0;
+               k = ZONE_NORMAL;
+               if (i & __GFP_HIGHMEM)
+                       k = ZONE_HIGHMEM;
+               if (i & __GFP_DMA)
+                       k = ZONE_DMA;
+
+               j = build_zonelists_node(pgdat, zonelist, j, k);
+               /*
+                * Now we build the zonelist so that it contains the zones
+                * of all the other nodes.
+                * We don't want to pressure a particular node, so when
+                * building the zones for node N, we make sure that the
+                * zones coming right after the local ones are those from
+                * node N+1 (modulo the number of nodes)
+                */
+               for (node = local_node + 1; node < numnodes; node++)
+                       j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+               for (node = 0; node < local_node; node++)
+                       j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+               zonelist->zones[j] = NULL;
+       }
+}
+
+#endif /* CONFIG_NUMA */
+
+void __init build_all_zonelists(void)
+{
+       int i;
+
+       for(i = 0 ; i < numnodes ; i++)
+               build_zonelists(NODE_DATA(i));
+       printk("Built %i zonelists\n", numnodes);
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE    256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+       unsigned long size = 1;
+
+       pages /= PAGES_PER_WAITQUEUE;
+
+       while (size < pages)
+               size <<= 1;
+
+       /*
+        * Once we have dozens or even hundreds of threads sleeping
+        * on IO we've got bigger problems than wait queue collision.
+        * Limit the size of the wait table to a reasonable size.
+        */
+       size = min(size, 4096UL);
+
+       return max(size, 4UL);
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+       return ffz(~size);
+}
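+
+/*
+ * Example: a 1GB zone with 4KB pages has 262144 pages, giving
+ * 262144/256 = 1024 wait queues (a power of two within the [4, 4096]
+ * clamp), hashed with wait_table_bits(1024) = 10 bits.
+ */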
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
+               unsigned long *zones_size, unsigned long *zholes_size)
+{
+       unsigned long realtotalpages, totalpages = 0;
+       int i;
+
+       for (i = 0; i < MAX_NR_ZONES; i++)
+               totalpages += zones_size[i];
+       pgdat->node_spanned_pages = totalpages;
+
+       realtotalpages = totalpages;
+       if (zholes_size)
+               for (i = 0; i < MAX_NR_ZONES; i++)
+                       realtotalpages -= zholes_size[i];
+       pgdat->node_present_pages = realtotalpages;
+       printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+}
+
+
+/*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+void __init memmap_init_zone(struct page *start, unsigned long size, int nid,
+               unsigned long zone, unsigned long start_pfn)
+{
+       struct page *page;
+
+       for (page = start; page < (start + size); page++) {
+               set_page_zone(page, NODEZONE(nid, zone));
+               set_page_count(page, 0);
+               SetPageReserved(page);
+               INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+               /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+               if (zone != ZONE_HIGHMEM)
+                       set_page_address(page, __va(start_pfn << PAGE_SHIFT));
+#endif
+               start_pfn++;
+       }
+}
+
+#ifndef __HAVE_ARCH_MEMMAP_INIT
+#define memmap_init(start, size, nid, zone, start_pfn) \
+       memmap_init_zone((start), (size), (nid), (zone), (start_pfn))
+#endif
+
+/*
+ * Set up the zone data structures:
+ *   - mark all pages reserved
+ *   - mark all memory queues empty
+ *   - clear the memory bitmaps
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat,
+               unsigned long *zones_size, unsigned long *zholes_size)
+{
+       unsigned long i, j;
+       const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+       int cpu, nid = pgdat->node_id;
+       struct page *lmem_map = pgdat->node_mem_map;
+       unsigned long zone_start_pfn = pgdat->node_start_pfn;
+
+       pgdat->nr_zones = 0;
+       init_waitqueue_head(&pgdat->kswapd_wait);
+       
+       for (j = 0; j < MAX_NR_ZONES; j++) {
+               struct zone *zone = pgdat->node_zones + j;
+               unsigned long size, realsize;
+               unsigned long batch;
+
+               zone_table[NODEZONE(nid, j)] = zone;
+               realsize = size = zones_size[j];
+               if (zholes_size)
+                       realsize -= zholes_size[j];
+
+               zone->spanned_pages = size;
+               zone->present_pages = realsize;
+               zone->name = zone_names[j];
+               spin_lock_init(&zone->lock);
+               spin_lock_init(&zone->lru_lock);
+               zone->zone_pgdat = pgdat;
+               zone->free_pages = 0;
+
+               zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+
+               /*
+                * The per-cpu-pages pools are set to around 1000th of the
+                * size of the zone.  But no more than 1/4 of a meg - there's
+                * no point in going beyond the size of L2 cache.
+                *
+                * OK, so we don't know how big the cache is.  So guess.
+                */
+               batch = zone->present_pages / 1024;
+               if (batch * PAGE_SIZE > 256 * 1024)
+                       batch = (256 * 1024) / PAGE_SIZE;
+               batch /= 4;             /* We effectively *= 4 below */
+               if (batch < 1)
+                       batch = 1;
+
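+               /*
+                * Example: a 512MB zone with 4KB pages has 131072 pages:
+                * batch = 131072/1024 = 128, capped at 64 (256KB worth),
+                * then 64/4 = 16, so the hot pcp below gets low 32,
+                * high 96, batch 16.
+                */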
+               for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                       struct per_cpu_pages *pcp;
+
+                       pcp = &zone->pageset[cpu].pcp[0];       /* hot */
+                       pcp->count = 0;
+                       pcp->low = 2 * batch;
+                       pcp->high = 6 * batch;
+                       pcp->batch = 1 * batch;
+                       INIT_LIST_HEAD(&pcp->list);
+
+                       pcp = &zone->pageset[cpu].pcp[1];       /* cold */
+                       pcp->count = 0;
+                       pcp->low = 0;
+                       pcp->high = 2 * batch;
+                       pcp->batch = 1 * batch;
+                       INIT_LIST_HEAD(&pcp->list);
+               }
+               printk("  %s zone: %lu pages, LIFO batch:%lu\n",
+                               zone_names[j], realsize, batch);
+               INIT_LIST_HEAD(&zone->active_list);
+               INIT_LIST_HEAD(&zone->inactive_list);
+               atomic_set(&zone->nr_scan_active, 0);
+               atomic_set(&zone->nr_scan_inactive, 0);
+               zone->nr_active = 0;
+               zone->nr_inactive = 0;
+               if (!size)
+                       continue;
+
+               /*
+                * The per-page waitqueue mechanism uses hashed waitqueues
+                * per zone.
+                */
+               zone->wait_table_size = wait_table_size(size);
+               zone->wait_table_bits =
+                       wait_table_bits(zone->wait_table_size);
+               zone->wait_table = (wait_queue_head_t *)
+                       alloc_bootmem_node(pgdat, zone->wait_table_size
+                                               * sizeof(wait_queue_head_t));
+
+               for(i = 0; i < zone->wait_table_size; ++i)
+                       init_waitqueue_head(zone->wait_table + i);
+
+               pgdat->nr_zones = j+1;
+
+               zone->zone_mem_map = lmem_map;
+               zone->zone_start_pfn = zone_start_pfn;
+
+               if ((zone_start_pfn) & (zone_required_alignment-1))
+                       printk("BUG: wrong zone alignment, it will crash\n");
+
+               memmap_init(lmem_map, size, nid, j, zone_start_pfn);
+
+               zone_start_pfn += size;
+               lmem_map += size;
+
+               for (i = 0; ; i++) {
+                       unsigned long bitmap_size;
+
+                       INIT_LIST_HEAD(&zone->free_area[i].free_list);
+                       if (i == MAX_ORDER-1) {
+                               zone->free_area[i].map = NULL;
+                               break;
+                       }
+
+                       /*
+                        * Page buddy system uses "index >> (i+1)",
+                        * where "index" is at most "size-1".
+                        *
+                        * The extra "+3" is to round down to byte
+                        * size (8 bits per byte assumption). Thus
+                        * we get "(size-1) >> (i+4)" as the last byte
+                        * we can access.
+                        *
+                        * The "+1" is because we want to round the
+                        * byte allocation up rather than down. So
+                        * we should have had a "+7" before we shifted
+                        * down by three. Also, we have to add one as
+                        * we actually _use_ the last bit (it's [0,n]
+                        * inclusive, not [0,n[).
+                        *
+                        * So we actually had +7+1 before we shift
+                        * down by 3. But (n+8) >> 3 == (n >> 3) + 1
+                        * (modulo overflows, which we do not have).
+                        *
+                        * Finally, we LONG_ALIGN because all bitmap
+                        * operations are on longs.
+                        */
+                       bitmap_size = (size-1) >> (i+4);
+                       bitmap_size = LONG_ALIGN(bitmap_size+1);
+                       zone->free_area[i].map = 
+                         (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+               }
+       }
+}
+
+void __init free_area_init_node(int nid, struct pglist_data *pgdat,
+               struct page *node_mem_map, unsigned long *zones_size,
+               unsigned long node_start_pfn, unsigned long *zholes_size)
+{
+       unsigned long size;
+
+       pgdat->node_id = nid;
+       pgdat->node_start_pfn = node_start_pfn;
+       calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+       if (!node_mem_map) {
+               size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+               node_mem_map = alloc_bootmem_node(pgdat, size);
+       }
+       pgdat->node_mem_map = node_mem_map;
+
+       free_area_init_core(pgdat, zones_size, zholes_size);
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+static bootmem_data_t contig_bootmem_data;
+struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
+
+EXPORT_SYMBOL(contig_page_data);
+
+void __init free_area_init(unsigned long *zones_size)
+{
+       free_area_init_node(0, &contig_page_data, NULL, zones_size,
+                       __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+       mem_map = contig_page_data.node_mem_map;
+}
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/seq_file.h>
+
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+       pg_data_t *pgdat;
+       loff_t node = *pos;
+
+       for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+               --node;
+
+       return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+       pg_data_t *pgdat = (pg_data_t *)arg;
+
+       (*pos)++;
+       return pgdat->pgdat_next;
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/* 
+ * This walks the freelist for each zone. Whilst this is slow, I'd rather 
+ * be slow here than slow down the fast path by keeping stats - mjbligh
+ */
+static int frag_show(struct seq_file *m, void *arg)
+{
+       pg_data_t *pgdat = (pg_data_t *)arg;
+       struct zone *zone;
+       struct zone *node_zones = pgdat->node_zones;
+       unsigned long flags;
+       int order;
+
+       for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+               if (!zone->present_pages)
+                       continue;
+
+               spin_lock_irqsave(&zone->lock, flags);
+               seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
+               for (order = 0; order < MAX_ORDER; ++order) {
+                       unsigned long nr_bufs = 0;
+                       struct list_head *elem;
+
+                       list_for_each(elem, &(zone->free_area[order].free_list))
+                               ++nr_bufs;
+                       seq_printf(m, "%6lu ", nr_bufs);
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
+               seq_putc(m, '\n');
+       }
+       return 0;
+}
+
+struct seq_operations fragmentation_op = {
+       .start  = frag_start,
+       .next   = frag_next,
+       .stop   = frag_stop,
+       .show   = frag_show,
+};
+
+static char *vmstat_text[] = {
+       "nr_dirty",
+       "nr_writeback",
+       "nr_unstable",
+       "nr_page_table_pages",
+       "nr_mapped",
+       "nr_slab",
+
+       "pgpgin",
+       "pgpgout",
+       "pswpin",
+       "pswpout",
+       "pgalloc_high",
+
+       "pgalloc_normal",
+       "pgalloc_dma",
+       "pgfree",
+       "pgactivate",
+       "pgdeactivate",
+
+       "pgfault",
+       "pgmajfault",
+       "pgrefill_high",
+       "pgrefill_normal",
+       "pgrefill_dma",
+
+       "pgsteal_high",
+       "pgsteal_normal",
+       "pgsteal_dma",
+       "pgscan_kswapd_high",
+       "pgscan_kswapd_normal",
+
+       "pgscan_kswapd_dma",
+       "pgscan_direct_high",
+       "pgscan_direct_normal",
+       "pgscan_direct_dma",
+       "pginodesteal",
+
+       "slabs_scanned",
+       "kswapd_steal",
+       "kswapd_inodesteal",
+       "pageoutrun",
+       "allocstall",
+
+       "pgrotated",
+};
+
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+       struct page_state *ps;
+
+       if (*pos >= ARRAY_SIZE(vmstat_text))
+               return NULL;
+
+       ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+       m->private = ps;
+       if (!ps)
+               return ERR_PTR(-ENOMEM);
+       get_full_page_state(ps);
+       ps->pgpgin /= 2;                /* sectors -> kbytes */
+       ps->pgpgout /= 2;
+       return (unsigned long *)ps + *pos;
+}
+
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+       (*pos)++;
+       if (*pos >= ARRAY_SIZE(vmstat_text))
+               return NULL;
+       return (unsigned long *)m->private + *pos;
+}
+
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+       unsigned long *l = arg;
+       unsigned long off = l - (unsigned long *)m->private;
+
+       seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
+       return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+       kfree(m->private);
+       m->private = NULL;
+}
+
+struct seq_operations vmstat_op = {
+       .start  = vmstat_start,
+       .next   = vmstat_next,
+       .stop   = vmstat_stop,
+       .show   = vmstat_show,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int page_alloc_cpu_notify(struct notifier_block *self,
+                                unsigned long action, void *hcpu)
+{
+       int cpu = (unsigned long)hcpu;
+       long *count;
+
+       if (action == CPU_DEAD) {
+               /* Drain local pagecache count. */
+               count = &per_cpu(nr_pagecache_local, cpu);
+               atomic_add(*count, &nr_pagecache);
+               *count = 0;
+               local_irq_disable();
+               __drain_pages(cpu);
+               local_irq_enable();
+       }
+       return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void __init page_alloc_init(void)
+{
+       hotcpu_notifier(page_alloc_cpu_notify, 0);
+}
+
+static unsigned long higherzone_val(struct zone *z, int max_zone,
+                                       int alloc_type)
+{
+       int z_idx = zone_idx(z);
+       struct zone *higherzone;
+       unsigned long pages;
+
+       /* there is no higher zone to get a contribution from */
+       if (z_idx == MAX_NR_ZONES-1)
+               return 0;
+
+       higherzone = &z->zone_pgdat->node_zones[z_idx+1];
+
+       /* We always start with the higher zone's protection value */
+       pages = higherzone->protection[alloc_type];
+
+       /*
+        * We get a lower-zone-protection contribution only if there are
+        * pages in the higher zone and if we're not the highest zone
+        * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
+        * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
+        * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
+        */
+       if (higherzone->present_pages && z_idx < alloc_type)
+               pages += higherzone->pages_low * sysctl_lower_zone_protection;
+
+       return pages;
+}
+
+/*
+ * setup_per_zone_protection - called whenever min_free_kbytes or
+ *     sysctl_lower_zone_protection changes.  Ensures that each zone
+ *     has correct protection[] values, so an adequate number of
+ *     pages are left in the zone after a successful __alloc_pages().
+ *
+ *     This algorithm is way confusing.  It tries to keep the same behavior
+ *     as we had with the incremental min iterative algorithm.
+ */
+static void setup_per_zone_protection(void)
+{
+       struct pglist_data *pgdat;
+       struct zone *zones, *zone;
+       int max_zone;
+       int i, j;
+
+       for_each_pgdat(pgdat) {
+               zones = pgdat->node_zones;
+
+               for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
+                       if (zones[i].present_pages)
+                               max_zone = i;
+
+               /*
+                * For each of the different allocation types:
+                * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
+                */
+               for (i = 0; i < MAX_NR_ZONES; i++) {
+                       /*
+                        * For each of the zones:
+                        * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
+                        */
+                       for (j = MAX_NR_ZONES-1; j >= 0; j--) {
+                               zone = &zones[j];
+
+                               /*
+                                * We never protect zones that don't have memory
+                                * in them (j>max_zone) or zones that aren't in
+                                * the zonelists for a certain type of
+                                * allocation (j>i).  We have to assign these to
+                                * zero because the lower zones take
+                                * contributions from the higher zones.
+                                */
+                               if (j > max_zone || j > i) {
+                                       zone->protection[i] = 0;
+                                       continue;
+                               }
+                               /*
+                                * The contribution of the next higher zone
+                                */
+                               zone->protection[i] = higherzone_val(zone,
+                                                               max_zone, i);
+                               zone->protection[i] += zone->pages_low;
+                       }
+               }
+       }
+}
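+
+/*
+ * With sysctl_lower_zone_protection == 0 this reduces to protection[i]
+ * being the sum of pages_low over the zone itself and every zone above
+ * it up to allocation class i; a non-zero sysctl additionally feeds in
+ * pages_low * sysctl_lower_zone_protection from each populated higher
+ * zone.
+ */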
+
+/*
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
+ *     that the pages_{min,low,high} values for each zone are set correctly 
+ *     with respect to min_free_kbytes.
+ */
+static void setup_per_zone_pages_min(void)
+{
+       unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+       unsigned long lowmem_pages = 0;
+       struct zone *zone;
+       unsigned long flags;
+
+       /* Calculate total number of !ZONE_HIGHMEM pages */
+       for_each_zone(zone) {
+               if (!is_highmem(zone))
+                       lowmem_pages += zone->present_pages;
+       }
+
+       for_each_zone(zone) {
+               spin_lock_irqsave(&zone->lru_lock, flags);
+               if (is_highmem(zone)) {
+                       /*
+                        * Often, highmem doesn't need to reserve any pages.
+                        * But the pages_min/low/high values are also used for
+                        * batching up page reclaim activity so we need a
+                        * decent value here.
+                        */
+                       int min_pages;
+
+                       min_pages = zone->present_pages / 1024;
+                       if (min_pages < SWAP_CLUSTER_MAX)
+                               min_pages = SWAP_CLUSTER_MAX;
+                       if (min_pages > 128)
+                               min_pages = 128;
+                       zone->pages_min = min_pages;
+               } else {
+                       /* if it's a lowmem zone, reserve a number of pages 
+                        * proportionate to the zone's size.
+                        */
+                       zone->pages_min = (pages_min * zone->present_pages) / 
+                                          lowmem_pages;
+               }
+
+               zone->pages_low = zone->pages_min * 2;
+               zone->pages_high = zone->pages_min * 3;
+               spin_unlock_irqrestore(&zone->lru_lock, flags);
+       }
+}
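+
+/*
+ * Example: with min_free_kbytes = 1024 and 4KB pages, pages_min totals
+ * 1024 >> 2 = 256 pages, spread across the lowmem zones in proportion
+ * to their size; each zone then gets pages_low = 2 * pages_min and
+ * pages_high = 3 * pages_min.
+ */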
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * For small machines we want it small (128k min).  For large machines
+ * we want it large (16MB max).  But it is not linear, because network
+ * bandwidth does not increase linearly with machine size.  We use
+ *
+ *     min_free_kbytes = sqrt(lowmem_kbytes)
+ *
+ * which yields
+ *
+ * 16MB:       128k
+ * 32MB:       181k
+ * 64MB:       256k
+ * 128MB:      362k
+ * 256MB:      512k
+ * 512MB:      724k
+ * 1024MB:     1024k
+ * 2048MB:     1448k
+ * 4096MB:     2048k
+ * 8192MB:     2896k
+ * 16384MB:    4096k
+ */
+static int __init init_per_zone_pages_min(void)
+{
+       unsigned long lowmem_kbytes;
+
+       lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
+
+       min_free_kbytes = int_sqrt(lowmem_kbytes);
+       if (min_free_kbytes < 128)
+               min_free_kbytes = 128;
+       if (min_free_kbytes > 16384)
+               min_free_kbytes = 16384;
+       setup_per_zone_pages_min();
+       setup_per_zone_protection();
+       return 0;
+}
+module_init(init_per_zone_pages_min)
+
+/*
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ *     that we can call two helper functions whenever min_free_kbytes
+ *     changes.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+               struct file *file, void __user *buffer, size_t *length)
+{
+       proc_dointvec(table, write, file, buffer, length);
+       setup_per_zone_pages_min();
+       setup_per_zone_protection();
+       return 0;
+}
+
+/*
+ * lower_zone_protection_sysctl_handler - just a wrapper around
+ *     proc_dointvec() so that we can call setup_per_zone_protection()
+ *     whenever sysctl_lower_zone_protection changes.
+ */
+int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+                struct file *file, void __user *buffer, size_t *length)
+{
+       proc_dointvec_minmax(table, write, file, buffer, length);
+       setup_per_zone_protection();
+       return 0;
+}